diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c407f84add..b5ae979d7ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ Docs: https://docs.openclaw.ai ### Changes - Plugins/tokenjuice: bump the bundled tokenjuice runtime to 0.6.3. Thanks @vincentkoc. +- TTS/agents: allow `agents.list[].tts` to override global + `messages.tts` for per-agent voices while keeping shared provider + credentials and preferences in the existing TTS config surface. - Providers/Azure Speech: add Azure Speech as a bundled TTS provider with Speech-resource auth, voice listing, SSML escaping, native Ogg/Opus voice-note output, and telephony output. (#51776) Thanks @leonchui. diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index d716da5c85b..a1d33ab3cdc 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -211e9d4cdb309e7fe0c1ed91d060201240a9287f8c5cb3c893aba3f904a20d30 config-baseline.json -ffda2d2911adc03148a368f3b40b17cbdcb7af0066bccdc555e8d596cdea8cda config-baseline.core.json +3efb041739877bd5387ffc87e0ddd11be43d80d38e7779407ce8091dcb797e5e config-baseline.json +5c6e35c5846f654d717d4b20853649e0b45a746423834f539b2a2223abcd5226 config-baseline.core.json 7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json -9e131d7734f8b9cc9e7f8af6cc6b6dc81c9971dc551fadbe66fb0d682173f32d config-baseline.plugin.json +a5479c182ec987bb21e814b8a4e7b3bda7190ae5c2b35fd5ca403dfa48afa115 config-baseline.plugin.json diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index 0ec8dfc602f..53c432b7837 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -c911117176b41eebf26470618274a7e093910e9b36855bc045bc8a92f6856745 plugin-sdk-api-baseline.json -ff360635f95beb217b9dd207a87eaf331319a7671aea03acfe05911756741b21 
plugin-sdk-api-baseline.jsonl +6eb33044c2a4726f1aeb2d18052643c38c8bf5244bb970f969b1583365063e8b plugin-sdk-api-baseline.json +06e70516047f98d78963c238f1671feb3eea7c7e559c6fa84f403b9562028bb2 plugin-sdk-api-baseline.jsonl diff --git a/docs/gateway/config-agents.md b/docs/gateway/config-agents.md index 521dad4980f..eabb6e293fa 100644 --- a/docs/gateway/config-agents.md +++ b/docs/gateway/config-agents.md @@ -915,6 +915,11 @@ scripts/sandbox-browser-setup.sh # optional browser image fastModeDefault: false, // per-agent fast mode override embeddedHarness: { runtime: "auto", fallback: "pi" }, params: { cacheRetention: "none" }, // overrides matching defaults.models params by key + tts: { + providers: { + elevenlabs: { voiceId: "EXAVITQu4vr4xnSDxMaL" }, + }, + }, skills: ["docs-search"], // replaces agents.defaults.skills when set identity: { name: "Samantha", @@ -950,6 +955,7 @@ scripts/sandbox-browser-setup.sh # optional browser image - `default`: when multiple are set, first wins (warning logged). If none set, first list entry is default. - `model`: string form overrides `primary` only; object form `{ primary, fallbacks }` overrides both (`[]` disables global fallbacks). Cron jobs that only override `primary` still inherit default fallbacks unless you set `fallbacks: []`. - `params`: per-agent stream params merged over the selected model entry in `agents.defaults.models`. Use this for agent-specific overrides like `cacheRetention`, `temperature`, or `maxTokens` without duplicating the whole model catalog. +- `tts`: optional per-agent text-to-speech overrides. The block deep-merges over `messages.tts`, so keep shared provider credentials and fallback policy in `messages.tts` and set only persona-specific values such as provider, voice, model, style, or auto mode here. - `skills`: optional per-agent skill allowlist. If omitted, the agent inherits `agents.defaults.skills` when set; an explicit list replaces defaults instead of merging, and `[]` means no skills. 
- `thinkingDefault`: optional per-agent default thinking level (`off | minimal | low | medium | high | xhigh | adaptive | max`). Overrides `agents.defaults.thinkingDefault` for this agent when no per-message or session override is set. The selected provider/model profile controls which values are valid; for Google Gemini, `adaptive` keeps provider-owned dynamic thinking (`thinkingLevel` omitted on Gemini 3/3.1, `thinkingBudget: -1` on Gemini 2.5). - `reasoningDefault`: optional per-agent default reasoning visibility (`on | off | stream`). Applies when no per-message or session reasoning override is set. diff --git a/docs/reference/secretref-credential-surface.md b/docs/reference/secretref-credential-surface.md index 77904decad4..b0061d7b512 100644 --- a/docs/reference/secretref-credential-surface.md +++ b/docs/reference/secretref-credential-surface.md @@ -35,6 +35,7 @@ Scope intent: - `models.providers.*.request.tls.passphrase` - `skills.entries.*.apiKey` - `agents.defaults.memorySearch.remote.apiKey` +- `agents.list[].tts.providers.*.apiKey` - `agents.list[].memorySearch.remote.apiKey` - `talk.providers.*.apiKey` - `messages.tts.providers.*.apiKey` diff --git a/docs/reference/secretref-user-supplied-credentials-matrix.json b/docs/reference/secretref-user-supplied-credentials-matrix.json index 9f427138e28..f44221000f1 100644 --- a/docs/reference/secretref-user-supplied-credentials-matrix.json +++ b/docs/reference/secretref-user-supplied-credentials-matrix.json @@ -29,6 +29,13 @@ "secretShape": "secret_input", "optIn": true }, + { + "id": "agents.list[].tts.providers.*.apiKey", + "configFile": "openclaw.json", + "path": "agents.list[].tts.providers.*.apiKey", + "secretShape": "secret_input", + "optIn": true + }, { "id": "auth-profiles.api_key.key", "configFile": "auth-profiles.json", diff --git a/docs/tools/tts.md b/docs/tools/tts.md index c8a8c20f98c..ef19241702d 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -109,6 +109,50 @@ Full schema is in [Gateway 
configuration](/gateway/configuration). } ``` +### Per-agent voice overrides + +Use `agents.list[].tts` when one agent should speak with a different provider, +voice, model, style, or auto-TTS mode. The agent block deep-merges over +`messages.tts`, so provider credentials can stay in the global provider config. + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "elevenlabs", + providers: { + elevenlabs: { + apiKey: "${ELEVENLABS_API_KEY}", + model: "eleven_multilingual_v2", + }, + }, + }, + }, + agents: { + list: [ + { + id: "reader", + tts: { + providers: { + elevenlabs: { + voiceId: "EXAVITQu4vr4xnSDxMaL", + }, + }, + }, + }, + ], + }, +} +``` + +Precedence for automatic replies, lowest to highest (later entries win): + +1. `messages.tts` +2. active `agents.list[].tts` +3. local `/tts` preferences for this host +4. inline `[[tts:...]]` directives when model overrides are enabled + ### OpenAI primary with ElevenLabs fallback ```json5 @@ -702,7 +746,8 @@ Stored fields: - `maxLength` (summary threshold; default 1500 chars) - `summarize` (default `true`) -These override `messages.tts.*` for that host. +These override the effective config from `messages.tts` plus the active +`agents.list[].tts` block for that host. 
## Output formats (fixed) diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index b13aff709c2..149ecb84a31 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -49,7 +49,7 @@ vi.mock("../api.js", async () => { }; }); -const { _test, maybeApplyTtsToPayload } = await import("./tts.js"); +const { _test, maybeApplyTtsToPayload, resolveTtsConfig } = await import("./tts.js"); const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const; @@ -158,3 +158,82 @@ describe("speech-core native voice-note routing", () => { }); }); }); + +describe("speech-core per-agent TTS config", () => { + it("deep-merges the active agent TTS override over messages.tts", () => { + const cfg = { + messages: { + tts: { + enabled: true, + provider: "openai", + providers: { + openai: { + apiKey: "${OPENAI_API_KEY}", + voice: "coral", + speed: 1, + }, + }, + }, + }, + agents: { + list: [ + { + id: "reader", + tts: { + provider: "openai", + providers: { + openai: { + voice: "nova", + }, + }, + }, + }, + ], + }, + } satisfies OpenClawConfig; + + const resolved = resolveTtsConfig(cfg, "reader"); + + expect(resolved.rawConfig).toMatchObject({ + enabled: true, + provider: "openai", + providers: { + openai: { + apiKey: "${OPENAI_API_KEY}", + voice: "nova", + speed: 1, + }, + }, + }); + }); + + it("ignores prototype-pollution keys in agent TTS overrides", () => { + const cfg = { + messages: { + tts: { + provider: "openai", + providers: { + openai: { + voice: "coral", + }, + }, + }, + }, + agents: { + list: [ + { + id: "reader", + tts: JSON.parse( + '{"providers":{"openai":{"voice":"nova","__proto__":{"polluted":true}}}}', + ), + }, + ], + }, + } as OpenClawConfig; + + const resolved = resolveTtsConfig(cfg, "reader"); + + expect(resolved.rawConfig?.providers?.openai).toEqual({ voice: "nova" }); + expect(({} as Record).polluted).toBeUndefined(); + }); +}); diff --git 
a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 47f78df91dc..0a29d0b1b8a 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -62,6 +62,7 @@ const DEFAULT_TIMEOUT_MS = 30_000; const DEFAULT_TTS_MAX_LENGTH = 1500; const DEFAULT_TTS_SUMMARIZE = true; const DEFAULT_MAX_TEXT_LENGTH = 4096; +const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]); type TtsUserPrefs = { tts?: { @@ -240,6 +241,48 @@ function resolveRawProviderConfig( return asProviderConfig(direct); } +function isPlainObject(value: unknown): value is Record { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function deepMergeDefined(base: unknown, override: unknown): unknown { + if (!isPlainObject(base) || !isPlainObject(override)) { + return override === undefined ? base : override; + } + + const result: Record = { ...base }; + for (const [key, value] of Object.entries(override)) { + if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) { + continue; + } + const existing = result[key]; + result[key] = key in result ? deepMergeDefined(existing, value) : value; + } + return result; +} + +function normalizeAgentConfigId(value: string | undefined | null): string { + return normalizeLowercaseStringOrEmpty(value); +} + +function resolveAgentTtsOverride( + cfg: OpenClawConfig, + agentId: string | undefined, +): TtsConfig | undefined { + if (!agentId || !Array.isArray(cfg.agents?.list)) { + return undefined; + } + const normalized = normalizeAgentConfigId(agentId); + const agent = cfg.agents.list.find((entry) => normalizeAgentConfigId(entry.id) === normalized); + return agent?.tts; +} + +function resolveEffectiveTtsRawConfig(cfg: OpenClawConfig, agentId?: string): TtsConfig { + const base = cfg.messages?.tts ?? {}; + const override = resolveAgentTtsOverride(cfg, agentId); + return deepMergeDefined(base, override ?? 
{}) as TtsConfig; +} + function resolveLazyProviderConfig( config: ResolvedTtsConfig, providerId: string, @@ -313,8 +356,8 @@ export function getResolvedSpeechProviderConfig( return resolveLazyProviderConfig(config, canonical, cfg); } -export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { - const raw: TtsConfig = cfg.messages?.tts ?? {}; +export function resolveTtsConfig(cfg: OpenClawConfig, agentId?: string): ResolvedTtsConfig { + const raw: TtsConfig = resolveEffectiveTtsRawConfig(cfg, agentId); const providerSource = raw.provider ? "config" : "default"; const timeoutMs = raw.timeoutMs ?? DEFAULT_TIMEOUT_MS; const auto = resolveConfiguredTtsAutoMode(raw); @@ -367,11 +410,15 @@ export function resolveTtsAutoMode(params: { return params.config.auto; } -function resolveEffectiveTtsAutoState(params: { cfg: OpenClawConfig; sessionAuto?: string }): { +function resolveEffectiveTtsAutoState(params: { + cfg: OpenClawConfig; + sessionAuto?: string; + agentId?: string; +}): { autoMode: TtsAutoMode; prefsPath: string; } { - const raw: TtsConfig = params.cfg.messages?.tts ?? 
{}; + const raw: TtsConfig = resolveEffectiveTtsRawConfig(params.cfg, params.agentId); const prefsPath = resolveTtsPrefsPathValue(raw.prefsPath); const sessionAuto = normalizeTtsAutoMode(params.sessionAuto); if (sessionAuto) { @@ -387,12 +434,15 @@ function resolveEffectiveTtsAutoState(params: { cfg: OpenClawConfig; sessionAuto }; } -export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined { - const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg }); +export function buildTtsSystemPromptHint( + cfg: OpenClawConfig, + agentId?: string, +): string | undefined { + const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg, agentId }); if (autoMode === "off") { return undefined; } - const _config = resolveTtsConfig(cfg); + const _config = resolveTtsConfig(cfg, agentId); const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off"; const autoHint = @@ -504,11 +554,12 @@ export function resolveExplicitTtsOverrides(params: { provider?: string; modelId?: string; voiceId?: string; + agentId?: string; }): TtsDirectiveOverrides { const providerInput = params.provider?.trim(); const modelId = params.modelId?.trim(); const voiceId = params.voiceId?.trim(); - const config = resolveTtsConfig(params.cfg); + const config = resolveTtsConfig(params.cfg, params.agentId); const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config); const selectedProvider = canonicalizeSpeechProviderId(providerInput, params.cfg) ?? @@ -741,6 +792,7 @@ function resolveTtsRequestSetup(params: { prefsPath?: string; providerOverride?: TtsProvider; disableFallback?: boolean; + agentId?: string; }): | { config: ResolvedTtsConfig; @@ -749,7 +801,7 @@ function resolveTtsRequestSetup(params: { | { error: string; } { - const config = resolveTtsConfig(params.cfg); + const config = resolveTtsConfig(params.cfg, params.agentId); const prefsPath = params.prefsPath ?? 
resolveTtsPrefsPath(config); if (params.text.length > config.maxTextLength) { return { @@ -774,6 +826,7 @@ export async function textToSpeech(params: { overrides?: TtsDirectiveOverrides; disableFallback?: boolean; timeoutMs?: number; + agentId?: string; }): Promise { const synthesis = await synthesizeSpeech(params); if (!synthesis.success || !synthesis.audioBuffer || !synthesis.fileExtension) { @@ -819,6 +872,7 @@ export async function synthesizeSpeech(params: { overrides?: TtsDirectiveOverrides; disableFallback?: boolean; timeoutMs?: number; + agentId?: string; }): Promise { const setup = resolveTtsRequestSetup({ text: params.text, @@ -826,6 +880,7 @@ export async function synthesizeSpeech(params: { prefsPath: params.prefsPath, providerOverride: params.overrides?.provider, disableFallback: params.disableFallback, + agentId: params.agentId, }); if ("error" in setup) { return { success: false, error: setup.error }; @@ -1064,6 +1119,7 @@ export async function maybeApplyTtsToPayload(params: { kind?: "tool" | "block" | "final"; inboundAudio?: boolean; ttsAuto?: string; + agentId?: string; }): Promise { if (params.payload.isCompactionNotice) { return params.payload; @@ -1071,11 +1127,12 @@ export async function maybeApplyTtsToPayload(params: { const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg: params.cfg, sessionAuto: params.ttsAuto, + agentId: params.agentId, }); if (autoMode === "off") { return params.payload; } - const config = resolveTtsConfig(params.cfg); + const config = resolveTtsConfig(params.cfg, params.agentId); const activeProvider = getTtsProvider(config, prefsPath); const reply = resolveSendableOutboundReplyParts(params.payload); @@ -1183,6 +1240,7 @@ export async function maybeApplyTtsToPayload(params: { prefsPath, channel: params.channel, overrides: directives.overrides, + agentId: params.agentId, }); if (result.success && result.audioPath) { diff --git a/src/agents/agent-scope-config.ts b/src/agents/agent-scope-config.ts index 
df3fdef9649..5691180d621 100644 --- a/src/agents/agent-scope-config.ts +++ b/src/agents/agent-scope-config.ts @@ -25,6 +25,7 @@ export type ResolvedAgentConfig = { skills?: AgentEntry["skills"]; memorySearch?: AgentEntry["memorySearch"]; humanDelay?: AgentEntry["humanDelay"]; + tts?: AgentEntry["tts"]; contextLimits?: AgentContextLimitsConfig; heartbeat?: AgentEntry["heartbeat"]; identity?: AgentEntry["identity"]; @@ -123,6 +124,7 @@ export function resolveAgentConfig( skills: Array.isArray(entry.skills) ? entry.skills : undefined, memorySearch: entry.memorySearch, humanDelay: entry.humanDelay, + tts: entry.tts, contextLimits: typeof entry.contextLimits === "object" && entry.contextLimits ? { ...agentDefaults?.contextLimits, ...entry.contextLimits } diff --git a/src/agents/agent-scope.test.ts b/src/agents/agent-scope.test.ts index d303f541e9f..ea2825a9c60 100644 --- a/src/agents/agent-scope.test.ts +++ b/src/agents/agent-scope.test.ts @@ -65,6 +65,7 @@ describe("resolveAgentConfig", () => { groupChat: undefined, subagents: undefined, sandbox: undefined, + tts: undefined, tools: undefined, }); }); diff --git a/src/agents/cli-runner/helpers.ts b/src/agents/cli-runner/helpers.ts index 4a1b550ea65..332d110c250 100644 --- a/src/agents/cli-runner/helpers.ts +++ b/src/agents/cli-runner/helpers.ts @@ -99,7 +99,9 @@ export function buildSystemPrompt(params: { shell: detectRuntimeShell(), }, }); - const ttsHint = params.config ? buildTtsSystemPromptHint(params.config) : undefined; + const ttsHint = params.config + ? 
buildTtsSystemPromptHint(params.config, params.agentId) + : undefined; const ownerDisplay = resolveOwnerDisplaySetting(params.config); return buildAgentSystemPrompt({ workspaceDir: params.workspaceDir, diff --git a/src/agents/pi-embedded-runner/compact.ts b/src/agents/pi-embedded-runner/compact.ts index 85491675fbd..43688dda31c 100644 --- a/src/agents/pi-embedded-runner/compact.ts +++ b/src/agents/pi-embedded-runner/compact.ts @@ -722,7 +722,9 @@ export async function compactEmbeddedPiSessionDirect( cwd: effectiveWorkspace, moduleUrl: import.meta.url, }); - const ttsHint = params.config ? buildTtsSystemPromptHint(params.config) : undefined; + const ttsHint = params.config + ? buildTtsSystemPromptHint(params.config, sessionAgentId) + : undefined; const ownerDisplay = resolveOwnerDisplaySetting(params.config); const promptContributionContext: Parameters< AgentRuntimePlan["prompt"]["resolveSystemPromptContribution"] diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index 5c2fefd28e4..d58a5b7d1c6 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -1065,7 +1065,9 @@ export async function runEmbeddedAttempt( cwd: effectiveWorkspace, moduleUrl: import.meta.url, }); - const ttsHint = params.config ? buildTtsSystemPromptHint(params.config) : undefined; + const ttsHint = params.config + ? 
buildTtsSystemPromptHint(params.config, sessionAgentId) + : undefined; const ownerDisplay = resolveOwnerDisplaySetting(params.config); const heartbeatPrompt = shouldInjectHeartbeatPrompt({ config: params.config, diff --git a/src/auto-reply/reply/commands-system-prompt.ts b/src/auto-reply/reply/commands-system-prompt.ts index c79d25384f3..04d11e12d63 100644 --- a/src/auto-reply/reply/commands-system-prompt.ts +++ b/src/auto-reply/reply/commands-system-prompt.ts @@ -146,7 +146,7 @@ export async function resolveCommandsSystemPromptBundle( }, } : { enabled: false }; - const ttsHint = params.cfg ? buildTtsSystemPromptHint(params.cfg) : undefined; + const ttsHint = params.cfg ? buildTtsSystemPromptHint(params.cfg, sessionAgentId) : undefined; const systemPrompt = buildAgentSystemPrompt({ workspaceDir, diff --git a/src/auto-reply/reply/dispatch-acp-delivery.ts b/src/auto-reply/reply/dispatch-acp-delivery.ts index c1a16816404..a31ff162aee 100644 --- a/src/auto-reply/reply/dispatch-acp-delivery.ts +++ b/src/auto-reply/reply/dispatch-acp-delivery.ts @@ -88,6 +88,7 @@ async function shouldTreatDeliveredTextAsVisible(params: { async function maybeApplyAcpTts(params: { payload: ReplyPayload; cfg: OpenClawConfig; + agentId?: string; channel?: string; kind: ReplyDispatchKind; inboundAudio: boolean; @@ -100,6 +101,7 @@ async function maybeApplyAcpTts(params: { const ttsStatus = resolveStatusTtsSnapshot({ cfg: params.cfg, sessionAuto: params.ttsAuto, + agentId: params.agentId, }); if (!ttsStatus) { return params.payload; @@ -107,7 +109,7 @@ async function maybeApplyAcpTts(params: { if (ttsStatus.autoMode === "inbound" && !params.inboundAudio) { return params.payload; } - if (params.kind !== "final" && resolveConfiguredTtsMode(params.cfg) === "final") { + if (params.kind !== "final" && resolveConfiguredTtsMode(params.cfg, params.agentId) === "final") { return params.payload; } const { maybeApplyTtsToPayload } = await loadDispatchAcpTtsRuntime(); @@ -118,6 +120,7 @@ async function 
maybeApplyAcpTts(params: { kind: params.kind, inboundAudio: params.inboundAudio, ttsAuto: params.ttsAuto, + agentId: params.agentId, }); } @@ -153,6 +156,7 @@ export type AcpDispatchDeliveryCoordinator = { export function createAcpDispatchDeliveryCoordinator(params: { cfg: OpenClawConfig; + agentId?: string; ctx: FinalizedMsgContext; dispatcher: ReplyDispatcher; inboundAudio: boolean; @@ -294,6 +298,7 @@ export function createAcpDispatchDeliveryCoordinator(params: { const ttsPayload = await maybeApplyAcpTts({ payload, cfg: params.cfg, + agentId: params.agentId, channel: params.ttsChannel, kind, inboundAudio: params.inboundAudio, diff --git a/src/auto-reply/reply/dispatch-acp.ts b/src/auto-reply/reply/dispatch-acp.ts index aedc38720b4..b8e7d4ff359 100644 --- a/src/auto-reply/reply/dispatch-acp.ts +++ b/src/auto-reply/reply/dispatch-acp.ts @@ -186,6 +186,7 @@ async function maybeUnbindStaleBoundConversations(params: { async function finalizeAcpTurnOutput(params: { cfg: OpenClawConfig; sessionKey: string; + agentId: string; delivery: AcpDispatchDeliveryCoordinator; inboundAudio: boolean; sessionTtsAuto?: TtsAutoMode; @@ -195,12 +196,13 @@ async function finalizeAcpTurnOutput(params: { await params.delivery.settleVisibleText(); let queuedFinal = params.delivery.hasDeliveredVisibleText() && !params.delivery.hasFailedVisibleTextDelivery(); - const ttsMode = resolveConfiguredTtsMode(params.cfg); + const ttsMode = resolveConfiguredTtsMode(params.cfg, params.agentId); const accumulatedBlockText = params.delivery.getAccumulatedBlockText(); const hasAccumulatedBlockText = accumulatedBlockText.trim().length > 0; const ttsStatus = resolveStatusTtsSnapshot({ cfg: params.cfg, sessionAuto: params.sessionTtsAuto, + agentId: params.agentId, }); const canAttemptFinalTts = ttsStatus != null && !(ttsStatus.autoMode === "inbound" && !params.inboundAudio); @@ -216,6 +218,7 @@ async function finalizeAcpTurnOutput(params: { kind: "final", inboundAudio: params.inboundAudio, ttsAuto: 
params.sessionTtsAuto, + agentId: params.agentId, }); if (ttsSyntheticReply.mediaUrl) { const delivered = await params.delivery.deliver("final", { @@ -308,10 +311,12 @@ export async function tryDispatchAcpReply(params: { return null; } const canonicalSessionKey = acpResolution.sessionKey; + const acpAgentId = resolveAgentIdFromSessionKey(canonicalSessionKey); let queuedFinal = false; const delivery = createAcpDispatchDeliveryCoordinator({ cfg: params.cfg, + agentId: acpAgentId, ctx: params.ctx, dispatcher: params.dispatcher, inboundAudio: params.inboundAudio, @@ -476,6 +481,7 @@ export async function tryDispatchAcpReply(params: { (await finalizeAcpTurnOutput({ cfg: params.cfg, sessionKey: canonicalSessionKey, + agentId: acpAgentId, delivery, inboundAudio: params.inboundAudio, sessionTtsAuto: params.sessionTtsAuto, diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index 9957f6e1f8a..73d697b91e2 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -119,7 +119,9 @@ function loadReplyMediaPathsRuntime() { async function maybeApplyTtsToReplyPayload( params: Parameters>["maybeApplyTtsToPayload"]>[0], ) { - if (!shouldAttemptTtsPayload({ cfg: params.cfg, ttsAuto: params.ttsAuto })) { + if ( + !shouldAttemptTtsPayload({ cfg: params.cfg, ttsAuto: params.ttsAuto, agentId: params.agentId }) + ) { return params.payload; } const { maybeApplyTtsToPayload } = await loadTtsRuntime(); @@ -729,6 +731,7 @@ export async function dispatchReplyFromConfig( kind: "final", inboundAudio, ttsAuto: sessionTtsAuto, + agentId: sessionAgentId, }); const normalizedPayload = await normalizeReplyMediaPayload(ttsPayload); const result = await routeReplyToOriginating(normalizedPayload); @@ -996,6 +999,7 @@ export async function dispatchReplyFromConfig( kind: "tool", inboundAudio, ttsAuto: sessionTtsAuto, + agentId: sessionAgentId, }); const normalizedPayload = await 
normalizeReplyMediaPayload(ttsPayload); const deliveryPayload = resolveToolDeliveryPayload(normalizedPayload); @@ -1097,6 +1101,7 @@ export async function dispatchReplyFromConfig( kind: "block", inboundAudio, ttsAuto: sessionTtsAuto, + agentId: sessionAgentId, }); const normalizedPayload = await normalizeReplyMediaPayload(ttsPayload); if (shouldRouteToOriginating) { @@ -1167,7 +1172,7 @@ export async function dispatchReplyFromConfig( routedFinalCount += finalReply.routedFinalCount; } - const ttsMode = resolveConfiguredTtsMode(cfg); + const ttsMode = resolveConfiguredTtsMode(cfg, sessionAgentId); // Generate TTS-only reply after block streaming completes (when there's no final reply). // This handles the case where block streaming succeeds and drops final payloads, // but we still want TTS audio to be generated from the accumulated block content. @@ -1185,6 +1190,7 @@ export async function dispatchReplyFromConfig( kind: "final", inboundAudio, ttsAuto: sessionTtsAuto, + agentId: sessionAgentId, }); // Only send if TTS was actually applied (mediaUrl exists) if (ttsSyntheticReply.mediaUrl) { diff --git a/src/cli/command-secret-targets.ts b/src/cli/command-secret-targets.ts index 45a22ab0dd2..9bbed1c5707 100644 --- a/src/cli/command-secret-targets.ts +++ b/src/cli/command-secret-targets.ts @@ -27,6 +27,7 @@ const STATIC_AGENT_RUNTIME_BASE_TARGET_IDS = [ ...STATIC_MODEL_TARGET_IDS, "agents.defaults.memorySearch.remote.apiKey", "agents.list[].memorySearch.remote.apiKey", + "agents.list[].tts.providers.*.apiKey", "messages.tts.providers.*.apiKey", "skills.entries.*.apiKey", "tools.web.search.apiKey", diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts index 9fad1353469..5721ac38ac4 100644 --- a/src/config/schema.base.generated.ts +++ b/src/config/schema.base.generated.ts @@ -6531,6 +6531,177 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { }, additionalProperties: false, }, + tts: { + type: "object", + 
properties: { + auto: { + type: "string", + enum: ["off", "always", "inbound", "tagged"], + }, + enabled: { + type: "boolean", + }, + mode: { + type: "string", + enum: ["final", "all"], + }, + provider: { + type: "string", + minLength: 1, + }, + summaryModel: { + type: "string", + }, + modelOverrides: { + type: "object", + properties: { + enabled: { + type: "boolean", + }, + allowText: { + type: "boolean", + }, + allowProvider: { + type: "boolean", + }, + allowVoice: { + type: "boolean", + }, + allowModelId: { + type: "boolean", + }, + allowVoiceSettings: { + type: "boolean", + }, + allowNormalization: { + type: "boolean", + }, + allowSeed: { + type: "boolean", + }, + }, + additionalProperties: false, + }, + providers: { + type: "object", + propertyNames: { + type: "string", + }, + additionalProperties: { + type: "object", + properties: { + apiKey: { + anyOf: [ + { + type: "string", + }, + { + oneOf: [ + { + type: "object", + properties: { + source: { + type: "string", + const: "env", + }, + provider: { + type: "string", + pattern: "^[a-z][a-z0-9_-]{0,63}$", + }, + id: { + type: "string", + pattern: "^[A-Z][A-Z0-9_]{0,127}$", + }, + }, + required: ["source", "provider", "id"], + additionalProperties: false, + }, + { + type: "object", + properties: { + source: { + type: "string", + const: "file", + }, + provider: { + type: "string", + pattern: "^[a-z][a-z0-9_-]{0,63}$", + }, + id: { + type: "string", + }, + }, + required: ["source", "provider", "id"], + additionalProperties: false, + }, + { + type: "object", + properties: { + source: { + type: "string", + const: "exec", + }, + provider: { + type: "string", + pattern: "^[a-z][a-z0-9_-]{0,63}$", + }, + id: { + type: "string", + }, + }, + required: ["source", "provider", "id"], + additionalProperties: false, + }, + ], + }, + ], + }, + }, + additionalProperties: { + anyOf: [ + { + type: "string", + }, + { + type: "number", + }, + { + type: "boolean", + }, + { + type: "null", + }, + { + type: "array", + items: {}, + }, + 
{ + type: "object", + propertyNames: { + type: "string", + }, + additionalProperties: {}, + }, + ], + }, + }, + }, + prefsPath: { + type: "string", + }, + maxTextLength: { + type: "integer", + minimum: 1, + maximum: 9007199254740991, + }, + timeoutMs: { + type: "integer", + minimum: 1000, + maximum: 120000, + }, + }, + additionalProperties: false, + }, skillsLimits: { type: "object", properties: { @@ -27586,6 +27757,10 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { sensitive: true, tags: ["security", "auth"], }, + "agents.list[].tts.providers.*.apiKey": { + sensitive: true, + tags: ["security", "auth", "media"], + }, "agents.list[].sandbox.ssh.identityData": { sensitive: true, tags: ["security", "storage"], diff --git a/src/config/types.agents.ts b/src/config/types.agents.ts index 9a89144afaa..44e08ab7036 100644 --- a/src/config/types.agents.ts +++ b/src/config/types.agents.ts @@ -13,6 +13,7 @@ import type { DmScope, HumanDelayConfig, IdentityConfig } from "./types.base.js" import type { GroupChatConfig } from "./types.messages.js"; import type { SkillsLimitsConfig } from "./types.skills.js"; import type { AgentToolsConfig, MemorySearchConfig } from "./types.tools.js"; +import type { TtsConfig } from "./types.tts.js"; export type AgentRuntimeAcpConfig = { /** ACP harness adapter id (for example codex, claude). */ @@ -95,6 +96,8 @@ export type AgentConfig = { memorySearch?: MemorySearchConfig; /** Human-like delay between block replies for this agent. */ humanDelay?: HumanDelayConfig; + /** Optional per-agent TTS overrides, deep-merged over messages.tts. */ + tts?: TtsConfig; /** Optional per-agent skills subsystem overrides. */ skillsLimits?: Pick; /** Optional per-agent overrides for selected context/token-heavy limits. 
*/ diff --git a/src/config/zod-schema.agent-defaults.test.ts b/src/config/zod-schema.agent-defaults.test.ts index f49eff633f4..ad4dd60019a 100644 --- a/src/config/zod-schema.agent-defaults.test.ts +++ b/src/config/zod-schema.agent-defaults.test.ts @@ -140,6 +140,25 @@ describe("agent defaults schema", () => { expect(agent.heartbeat?.timeoutSeconds).toBe(45); }); + it("accepts per-agent TTS overrides", () => { + const agent = AgentEntrySchema.parse({ + id: "reader", + tts: { + provider: "openai", + auto: "always", + providers: { + openai: { + voice: "nova", + apiKey: "${OPENAI_API_KEY}", + }, + }, + }, + }); + + expect(agent.tts?.provider).toBe("openai"); + expect(agent.tts?.providers?.openai?.voice).toBe("nova"); + }); + it("rejects zero heartbeat timeoutSeconds", () => { expect(() => AgentDefaultsSchema.parse({ heartbeat: { timeoutSeconds: 0 } })).toThrow(); expect(() => AgentEntrySchema.parse({ id: "ops", heartbeat: { timeoutSeconds: 0 } })).toThrow(); diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index ff8a1e40c3b..e573681c221 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -13,6 +13,7 @@ import { SecretInputSchema, ToolsLinksSchema, ToolsMediaSchema, + TtsConfigSchema, } from "./zod-schema.core.js"; import { sensitive } from "./zod-schema.sensitive.js"; @@ -828,6 +829,7 @@ export const AgentEntrySchema = z skills: z.array(z.string()).optional(), memorySearch: MemorySearchSchema, humanDelay: HumanDelaySchema.optional(), + tts: TtsConfigSchema, skillsLimits: AgentSkillsLimitsSchema, contextLimits: AgentContextLimitsSchema, contextTokens: z.number().int().positive().optional(), diff --git a/src/plugin-sdk/tts-runtime.types.ts b/src/plugin-sdk/tts-runtime.types.ts index 09a8cac2ea1..99f8b8d1207 100644 --- a/src/plugin-sdk/tts-runtime.types.ts +++ b/src/plugin-sdk/tts-runtime.types.ts @@ -62,6 +62,7 @@ export type ResolveExplicitTtsOverridesParams = { provider?: 
string; modelId?: string; voiceId?: string; + agentId?: string; }; export type TtsRequestParams = { @@ -72,6 +73,7 @@ export type TtsRequestParams = { overrides?: TtsDirectiveOverrides; disableFallback?: boolean; timeoutMs?: number; + agentId?: string; }; export type TtsTelephonyRequestParams = { @@ -95,6 +97,7 @@ export type MaybeApplyTtsToPayloadParams = { kind?: "tool" | "block" | "final"; inboundAudio?: boolean; ttsAuto?: string; + agentId?: string; }; export type TtsTestFacade = { @@ -168,7 +171,7 @@ export type ListSpeechVoices = (params: ListSpeechVoicesParams) => Promise string | undefined; + buildTtsSystemPromptHint: (cfg: OpenClawConfig, agentId?: string) => string | undefined; getLastTtsAttempt: () => TtsStatusEntry | undefined; getResolvedSpeechProviderConfig: ( config: ResolvedTtsConfig, @@ -188,7 +191,7 @@ export type TtsRuntimeFacade = { maybeApplyTtsToPayload: (params: MaybeApplyTtsToPayloadParams) => Promise; resolveExplicitTtsOverrides: (params: ResolveExplicitTtsOverridesParams) => TtsDirectiveOverrides; resolveTtsAutoMode: (params: ResolveTtsAutoModeParams) => TtsAutoMode; - resolveTtsConfig: (cfg: OpenClawConfig) => ResolvedTtsConfig; + resolveTtsConfig: (cfg: OpenClawConfig, agentId?: string) => ResolvedTtsConfig; resolveTtsPrefsPath: (config: ResolvedTtsConfig) => string; resolveTtsProviderOrder: (primary: TtsProvider, cfg?: OpenClawConfig) => TtsProvider[]; setLastTtsAttempt: (entry: TtsStatusEntry | undefined) => void; diff --git a/src/secrets/runtime-config-collectors-core.ts b/src/secrets/runtime-config-collectors-core.ts index 8ce5183004d..60aac595e57 100644 --- a/src/secrets/runtime-config-collectors-core.ts +++ b/src/secrets/runtime-config-collectors-core.ts @@ -506,6 +506,29 @@ function collectMessagesTtsAssignments(params: { }); } +function collectAgentTtsAssignments(params: { + config: OpenClawConfig; + defaults: SecretDefaults | undefined; + context: ResolverContext; +}): void { + const agents = params.config.agents as Record | 
undefined; + const list = agents?.list; + if (!Array.isArray(list)) { + return; + } + for (const [index, entry] of list.entries()) { + if (!isRecord(entry) || !isRecord(entry.tts)) { + continue; + } + collectTtsApiKeyAssignments({ + tts: entry.tts, + pathPrefix: `agents.list.${index}.tts`, + defaults: params.defaults, + context: params.context, + }); + } +} + function collectCronAssignments(params: { config: OpenClawConfig; defaults: SecretDefaults | undefined; @@ -640,6 +663,7 @@ export function collectCoreConfigAssignments(params: { collectGatewayAssignments(params); collectSandboxSshAssignments(params); collectMessagesTtsAssignments(params); + collectAgentTtsAssignments(params); collectCronAssignments(params); collectMediaRequestAssignments(params); } diff --git a/src/secrets/target-registry-data.ts b/src/secrets/target-registry-data.ts index 968b365023a..da2174213ac 100644 --- a/src/secrets/target-registry-data.ts +++ b/src/secrets/target-registry-data.ts @@ -204,6 +204,18 @@ const CORE_SECRET_TARGET_REGISTRY: SecretTargetRegistryEntry[] = [ includeInAudit: true, providerIdPathSegmentIndex: 3, }, + { + id: "agents.list[].tts.providers.*.apiKey", + targetType: "agents.list[].tts.providers.*.apiKey", + configFile: "openclaw.json", + pathPattern: "agents.list[].tts.providers.*.apiKey", + secretShape: SECRET_INPUT_SHAPE, + expectedResolvedValue: "string", + includeInPlan: true, + includeInConfigure: false, + includeInAudit: true, + providerIdPathSegmentIndex: 4, + }, { id: "models.providers.*.apiKey", targetType: "models.providers.apiKey", diff --git a/src/status/status-message.ts b/src/status/status-message.ts index 612960f7ad6..7520123c7e6 100644 --- a/src/status/status-message.ts +++ b/src/status/status-message.ts @@ -451,6 +451,7 @@ const formatMediaUnderstandingLine = (decisions?: ReadonlyArray { if (!config) { return null; @@ -458,6 +459,7 @@ const formatVoiceModeLine = ( const snapshot = resolveStatusTtsSnapshot({ cfg: config, sessionAuto: 
sessionEntry?.ttsAuto, + agentId, }); if (!snapshot) { return null; @@ -890,7 +892,7 @@ export function buildStatusMessage(args: StatusArgs): string { const usageCostLine = usagePair && costLine ? `${usagePair} · ${costLine}` : (usagePair ?? costLine); const mediaLine = formatMediaUnderstandingLine(args.mediaDecisions); - const voiceLine = formatVoiceModeLine(args.config, args.sessionEntry); + const voiceLine = formatVoiceModeLine(args.config, args.sessionEntry, args.agentId); return [ versionLine, diff --git a/src/tts/status-config.test.ts b/src/tts/status-config.test.ts index c4a9a9166fc..8ec0f33ce37 100644 --- a/src/tts/status-config.test.ts +++ b/src/tts/status-config.test.ts @@ -104,6 +104,40 @@ describe("resolveStatusTtsSnapshot", () => { }); }); + it("reports per-agent TTS overrides", async () => { + await withStatusTempHome(async () => { + expect( + resolveStatusTtsSnapshot({ + cfg: { + messages: { + tts: { + auto: "off", + provider: "openai", + }, + }, + agents: { + list: [ + { + id: "reader", + tts: { + auto: "always", + provider: "elevenlabs", + }, + }, + ], + }, + } as OpenClawConfig, + agentId: "reader", + }), + ).toEqual({ + autoMode: "always", + provider: "elevenlabs", + maxLength: 1500, + summarize: true, + }); + }); + }); + it("derives the default prefs path from OPENCLAW_CONFIG_PATH when set", async () => { await withStatusTempHome(async (home) => { const stateDir = path.join(home, ".openclaw-dev"); diff --git a/src/tts/status-config.ts b/src/tts/status-config.ts index 810d09bac09..2d5509001ad 100644 --- a/src/tts/status-config.ts +++ b/src/tts/status-config.ts @@ -8,6 +8,7 @@ import { } from "../shared/string-coerce.js"; import { resolveConfigDir, resolveUserPath } from "../utils.js"; import { normalizeTtsAutoMode } from "./tts-auto-mode.js"; +import { resolveEffectiveTtsConfig } from "./tts-config.js"; const DEFAULT_TTS_MAX_LENGTH = 1500; const DEFAULT_TTS_SUMMARIZE = true; @@ -80,8 +81,9 @@ function resolveTtsAutoModeFromPrefs(prefs: 
TtsUserPrefs): TtsAutoMode | undefin export function resolveStatusTtsSnapshot(params: { cfg: OpenClawConfig; sessionAuto?: string; + agentId?: string; }): TtsStatusSnapshot | null { - const raw: TtsConfig = params.cfg.messages?.tts ?? {}; + const raw: TtsConfig = resolveEffectiveTtsConfig(params.cfg, params.agentId); const prefsPath = resolveTtsPrefsPathValue(raw.prefsPath); const prefs = readPrefs(prefsPath); const autoMode = diff --git a/src/tts/tts-config.test.ts b/src/tts/tts-config.test.ts index d48fe2f276b..c0f72d666e6 100644 --- a/src/tts/tts-config.test.ts +++ b/src/tts/tts-config.test.ts @@ -3,7 +3,7 @@ import { tmpdir } from "node:os"; import path from "node:path"; import { afterAll, beforeAll, afterEach, beforeEach, describe, expect, it } from "vitest"; import type { OpenClawConfig } from "../config/config.js"; -import { shouldAttemptTtsPayload } from "./tts-config.js"; +import { resolveConfiguredTtsMode, shouldAttemptTtsPayload } from "./tts-config.js"; describe("shouldAttemptTtsPayload", () => { let originalPrefsPath: string | undefined; @@ -61,4 +61,31 @@ describe("shouldAttemptTtsPayload", () => { shouldAttemptTtsPayload({ cfg: { messages: { tts: { enabled: true } } } as OpenClawConfig }), ).toBe(false); }); + + it("uses per-agent TTS auto and mode overrides", () => { + const cfg = { + messages: { + tts: { + auto: "off", + mode: "final", + }, + }, + agents: { + list: [ + { + id: "voice", + tts: { + auto: "always", + mode: "all", + }, + }, + ], + }, + } as OpenClawConfig; + + expect(shouldAttemptTtsPayload({ cfg, agentId: "voice" })).toBe(true); + expect(resolveConfiguredTtsMode(cfg, "voice")).toBe("all"); + expect(shouldAttemptTtsPayload({ cfg, agentId: "main" })).toBe(false); + expect(resolveConfiguredTtsMode(cfg, "main")).toBe("final"); + }); }); diff --git a/src/tts/tts-config.ts b/src/tts/tts-config.ts index 3f235e34e4f..ea4e679e48d 100644 --- a/src/tts/tts-config.ts +++ b/src/tts/tts-config.ts @@ -1,13 +1,54 @@ import { existsSync, readFileSync 
} from "node:fs"; import path from "node:path"; import type { OpenClawConfig } from "../config/types.js"; -import type { TtsAutoMode, TtsMode } from "../config/types.tts.js"; +import type { TtsAutoMode, TtsConfig, TtsMode } from "../config/types.tts.js"; +import { normalizeAgentId } from "../routing/session-key.js"; import { resolveConfigDir, resolveUserPath } from "../utils.js"; import { normalizeTtsAutoMode } from "./tts-auto-mode.js"; export { normalizeTtsAutoMode } from "./tts-auto-mode.js"; -export function resolveConfiguredTtsMode(cfg: OpenClawConfig): TtsMode { - return cfg.messages?.tts?.mode ?? "final"; +const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]); + +function isPlainObject(value: unknown): value is Record<string, unknown> { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function deepMergeDefined(base: unknown, override: unknown): unknown { + if (!isPlainObject(base) || !isPlainObject(override)) { + return override === undefined ? base : override; + } + + const result: Record<string, unknown> = { ...base }; + for (const [key, value] of Object.entries(override)) { + if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) { + continue; + } + const existing = result[key]; + result[key] = key in result ? deepMergeDefined(existing, value) : value; + } + return result; +} + +function resolveAgentTtsOverride( + cfg: OpenClawConfig, + agentId: string | undefined, +): TtsConfig | undefined { + if (!agentId || !Array.isArray(cfg.agents?.list)) { + return undefined; + } + const normalized = normalizeAgentId(agentId); + const agent = cfg.agents.list.find((entry) => normalizeAgentId(entry.id) === normalized); + return agent?.tts; +} + +export function resolveEffectiveTtsConfig(cfg: OpenClawConfig, agentId?: string): TtsConfig { + const base = cfg.messages?.tts ?? {}; + const override = resolveAgentTtsOverride(cfg, agentId); + return deepMergeDefined(base, override ?? 
{}) as TtsConfig; +} + +export function resolveConfiguredTtsMode(cfg: OpenClawConfig, agentId?: string): TtsMode { + return resolveEffectiveTtsConfig(cfg, agentId).mode ?? "final"; } function resolveTtsPrefsPathValue(prefsPath: string | undefined): string { @@ -45,13 +86,14 @@ function readTtsPrefsAutoMode(prefsPath: string): TtsAutoMode | undefined { export function shouldAttemptTtsPayload(params: { cfg: OpenClawConfig; ttsAuto?: string; + agentId?: string; }): boolean { const sessionAuto = normalizeTtsAutoMode(params.ttsAuto); if (sessionAuto) { return sessionAuto !== "off"; } - const raw = params.cfg.messages?.tts; + const raw = resolveEffectiveTtsConfig(params.cfg, params.agentId); const prefsAuto = readTtsPrefsAutoMode(resolveTtsPrefsPathValue(raw?.prefsPath)); if (prefsAuto) { return prefsAuto !== "off";