From b3d9948c4c56435a168b16791f9c975c752ce7a4 Mon Sep 17 00:00:00 2001 From: Josh Avant <830519+joshavant@users.noreply.github.com> Date: Mon, 27 Apr 2026 01:02:17 -0500 Subject: [PATCH] fix: use runtime snapshot for TTS SecretRefs (#72581) * fix: use runtime snapshot for tts secrets * fix: keep tts secret snapshot selection local * docs: add tts secretref changelog entry --- CHANGELOG.md | 1 + extensions/speech-core/src/tts.test.ts | 64 +++++++++++- extensions/speech-core/src/tts.ts | 131 +++++++++++++++++-------- 3 files changed, 154 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77834e1c843..2889b597383 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai - macOS Gateway: detect installed-but-unloaded LaunchAgent split-brain states during status, doctor, and restart, and re-bootstrap launchd supervision before falling back to unmanaged listener restarts. Fixes #67335, #53475, and #71060; refs #58890, #60885, and #70801. Thanks @ze1tgeist88, @dafacto, and @vishutdhar. - Plugins/install: stage bundled plugin runtime dependencies before Gateway startup and drain update restarts while preserving per-plugin isolation when pre-stage scan or install fails. Thanks @codex. +- TTS/SecretRef: resolve `messages.tts.providers.*.apiKey` from the active runtime snapshot so SecretRef-backed MiniMax and other TTS provider keys work in runtime reply/audio paths. Fixes #68690. Thanks @joshavant. - CLI/startup: read generated startup metadata from the bundled `dist` layout before falling back to live help rendering, so root/browser help and channel-option bootstrap stay on the fast path. Thanks @vincentkoc. - CLI/help: treat positional `help` invocations like `openclaw channels help` as help paths for startup gating, avoiding model/auth warmup while preserving positional arguments such as `openclaw docs help`. Thanks @gumadeiras. - Web search: route plugin-scoped web_search SecretRefs through the active runtime config snapshot so provider execution receives resolved credentials across app/runtime paths, including `plugins.entries.brave.config.webSearch.apiKey`. Fixes #68690. Thanks @VACInc. diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index 1da0bc9f196..206b4232e5a 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -1,6 +1,10 @@ import { rmSync } from "node:fs"; import path from "node:path"; -import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; +import { + clearRuntimeConfigSnapshot, + setRuntimeConfigSnapshot, + type OpenClawConfig, +} from "openclaw/plugin-sdk/config-runtime"; import type { ReplyPayload } from "openclaw/plugin-sdk/reply-payload"; import type { SpeechProviderPlugin, @@ -163,6 +167,7 @@ async function expectTtsPayloadResult(params: { describe("speech-core native voice-note routing", () => { afterEach(() => { + clearRuntimeConfigSnapshot(); synthesizeMock.mockClear(); prepareSynthesisMock.mockClear(); installSpeechProviders([createMockSpeechProvider()]); @@ -214,6 +219,63 @@ describe("speech-core native voice-note routing", () => { }); }); + it("uses the active runtime snapshot when source config still contains TTS SecretRefs", async () => { + const sourceConfig = { + messages: { + tts: { + enabled: true, + provider: "mock", + providers: { + mock: { + apiKey: { source: "exec", provider: "mockexec", id: "minimax/tts/apiKey" }, + }, + }, + }, + }, + } as unknown as OpenClawConfig; + const runtimeConfig = { + messages: { + tts: { + enabled: true, + provider: "mock", + providers: { + mock: { + apiKey: "resolved-minimax-key", + }, + }, + }, + }, + } as unknown as OpenClawConfig; + installSpeechProviders([ + createMockSpeechProvider("mock", { + isConfigured: ({ providerConfig }) => providerConfig.apiKey === "resolved-minimax-key", + resolveConfig: ({ rawConfig }) => { + const providers = rawConfig.providers as Record | undefined; + return { + apiKey: providers?.mock?.apiKey, + }; + }, + }), + ]); + setRuntimeConfigSnapshot(runtimeConfig, sourceConfig); + + const result = await synthesizeSpeech({ + text: "Runtime snapshot TTS SecretRef", + cfg: sourceConfig, + disableFallback: true, + }); + + expect(result.success).toBe(true); + expect(synthesizeMock).toHaveBeenCalledWith( + expect.objectContaining({ + cfg: runtimeConfig, + providerConfig: expect.objectContaining({ + apiKey: "resolved-minimax-key", + }), + }), + ); + }); + it.each(["feishu", "whatsapp"] as const)( "marks %s voice-note TTS for channel-side transcoding when provider returns mp3", async (channel) => { diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 1dacf4ba8a5..a98c7ab7bab 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -10,13 +10,15 @@ import { } from "node:fs"; import path from "node:path"; import { resolveChannelTtsVoiceDelivery } from "openclaw/plugin-sdk/channel-targets"; -import type { - OpenClawConfig, - ResolvedTtsPersona, - TtsAutoMode, - TtsConfig, - TtsModelOverrideConfig, - TtsProvider, +import { + getRuntimeConfigSnapshot, + getRuntimeConfigSourceSnapshot, + type OpenClawConfig, + type ResolvedTtsPersona, + type TtsAutoMode, + type TtsConfig, + type TtsModelOverrideConfig, + type TtsProvider, } from "openclaw/plugin-sdk/config-runtime"; import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; import { redactSensitiveText } from "openclaw/plugin-sdk/logging-core"; @@ -230,6 +232,43 @@ function _resolveRegistryDefaultSpeechProviderId(cfg?: OpenClawConfig): TtsProvi return sortSpeechProvidersForAutoSelection(cfg)[0]?.id ?? ""; } +function stableConfigStringify(value: unknown): string { + if (value === null || typeof value !== "object") { + return JSON.stringify(value) ?? "null"; + } + if (Array.isArray(value)) { + return `[${value.map((entry) => stableConfigStringify(entry)).join(",")}]`; + } + const record = value as Record; + return `{${Object.keys(record) + .toSorted() + .map((key) => `${JSON.stringify(key)}:${stableConfigStringify(record[key])}`) + .join(",")}}`; +} + +function configSnapshotsMatch(left: OpenClawConfig, right: OpenClawConfig): boolean { + if (left === right) { + return true; + } + try { + return stableConfigStringify(left) === stableConfigStringify(right); + } catch { + return false; + } +} + +function resolveTtsRuntimeConfig(cfg: OpenClawConfig): OpenClawConfig { + const runtimeConfig = getRuntimeConfigSnapshot(); + if (!runtimeConfig || cfg === runtimeConfig) { + return cfg; + } + const sourceConfig = getRuntimeConfigSourceSnapshot(); + if (!sourceConfig || configSnapshotsMatch(cfg, sourceConfig)) { + return runtimeConfig; + } + return cfg; +} + function asProviderConfig(value: unknown): SpeechProviderConfig { return typeof value === "object" && value !== null && !Array.isArray(value) ? (value as SpeechProviderConfig) @@ -343,7 +382,7 @@ function resolveLazyProviderConfig( const canonical = normalizeConfiguredSpeechProviderId(providerId) ?? normalizeLowercaseStringOrEmpty(providerId); const existing = config.providerConfigs[canonical]; - const effectiveCfg = cfg ?? config.sourceConfig; + const effectiveCfg = cfg ? resolveTtsRuntimeConfig(cfg) : config.sourceConfig; if (existing && !effectiveCfg) { return existing; } @@ -403,17 +442,19 @@ export function getResolvedSpeechProviderConfig( providerId: string, cfg?: OpenClawConfig, ): SpeechProviderConfig { + const effectiveCfg = cfg ? resolveTtsRuntimeConfig(cfg) : config.sourceConfig; const canonical = - canonicalizeSpeechProviderId(providerId, cfg) ?? + canonicalizeSpeechProviderId(providerId, effectiveCfg) ?? normalizeConfiguredSpeechProviderId(providerId) ?? normalizeLowercaseStringOrEmpty(providerId); - return resolveLazyProviderConfig(config, canonical, cfg); + return resolveLazyProviderConfig(config, canonical, effectiveCfg); } export function resolveTtsConfig( cfg: OpenClawConfig, contextOrAgentId?: string | TtsConfigResolutionContext, ): ResolvedTtsConfig { + cfg = resolveTtsRuntimeConfig(cfg); const raw: TtsConfig = resolveEffectiveTtsConfig(cfg, contextOrAgentId); const providerSource = raw.provider ? "config" : "default"; const timeoutMs = raw.timeoutMs ?? DEFAULT_TIMEOUT_MS; @@ -504,6 +545,7 @@ export function buildTtsSystemPromptHint( cfg: OpenClawConfig, agentId?: string, ): string | undefined { + cfg = resolveTtsRuntimeConfig(cfg); const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg, agentId }); if (autoMode === "off") { return undefined; @@ -667,17 +709,18 @@ export function resolveExplicitTtsOverrides(params: { channelId?: string; accountId?: string; }): TtsDirectiveOverrides { + const cfg = resolveTtsRuntimeConfig(params.cfg); const providerInput = params.provider?.trim(); const modelId = params.modelId?.trim(); const voiceId = params.voiceId?.trim(); - const config = resolveTtsConfig(params.cfg, { + const config = resolveTtsConfig(cfg, { agentId: params.agentId, channelId: params.channelId, accountId: params.accountId, }); const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config); const selectedProvider = - canonicalizeSpeechProviderId(providerInput, params.cfg) ?? + canonicalizeSpeechProviderId(providerInput, cfg) ?? (modelId || voiceId ? getTtsProvider(config, prefsPath) : undefined); if (providerInput && !selectedProvider) { @@ -692,7 +735,7 @@ export function resolveExplicitTtsOverrides(params: { throw new Error("TTS model or voice overrides require a resolved provider."); } - const provider = getSpeechProvider(selectedProvider, params.cfg); + const provider = getSpeechProvider(selectedProvider, cfg); if (!provider) { throw new Error(`speech provider ${selectedProvider} is not registered`); } @@ -812,9 +855,10 @@ function shouldDeliverTtsAsVoice(params: { } export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] { - const normalizedPrimary = canonicalizeSpeechProviderId(primary, cfg) ?? primary; + const effectiveCfg = cfg ? resolveTtsRuntimeConfig(cfg) : undefined; + const normalizedPrimary = canonicalizeSpeechProviderId(primary, effectiveCfg) ?? primary; const ordered = new Set([normalizedPrimary]); - for (const provider of sortSpeechProvidersForAutoSelection(cfg)) { + for (const provider of sortSpeechProvidersForAutoSelection(effectiveCfg)) { const normalized = provider.id; if (normalized !== normalizedPrimary) { ordered.add(normalized); @@ -828,14 +872,15 @@ export function isTtsProviderConfigured( provider: TtsProvider, cfg?: OpenClawConfig, ): boolean { - const resolvedProvider = getSpeechProvider(provider, cfg); + const effectiveCfg = cfg ? resolveTtsRuntimeConfig(cfg) : config.sourceConfig; + const resolvedProvider = getSpeechProvider(provider, effectiveCfg); if (!resolvedProvider) { return false; } return ( resolvedProvider.isConfigured({ - cfg, - providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, cfg), + cfg: effectiveCfg, + providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, effectiveCfg), timeoutMs: config.timeoutMs, }) ?? false ); @@ -1011,6 +1056,7 @@ function resolveTtsRequestSetup(params: { accountId?: string; }): | { + cfg: OpenClawConfig; config: ResolvedTtsConfig; persona?: ResolvedTtsPersona; providers: TtsProvider[]; @@ -1018,7 +1064,8 @@ function resolveTtsRequestSetup(params: { | { error: string; } { - const config = resolveTtsConfig(params.cfg, { + const cfg = resolveTtsRuntimeConfig(params.cfg); + const config = resolveTtsConfig(cfg, { agentId: params.agentId, channelId: params.channelId, accountId: params.accountId, @@ -1031,12 +1078,12 @@ function resolveTtsRequestSetup(params: { } const userProvider = getTtsProvider(config, prefsPath); - const provider = - canonicalizeSpeechProviderId(params.providerOverride, params.cfg) ?? userProvider; + const provider = canonicalizeSpeechProviderId(params.providerOverride, cfg) ?? userProvider; return { + cfg, config, persona: getTtsPersona(config, prefsPath), - providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, params.cfg), + providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, cfg), }; } @@ -1116,7 +1163,7 @@ export async function synthesizeSpeech(params: { return { success: false, error: setup.error }; } - const { config, persona, providers } = setup; + const { cfg, config, persona, providers } = setup; const timeoutMs = params.timeoutMs ?? config.timeoutMs; const target = resolveTtsSynthesisTarget(params.channel); @@ -1134,7 +1181,7 @@ export async function synthesizeSpeech(params: { try { const resolvedProvider = resolveReadySpeechProvider({ provider, - cfg: params.cfg, + cfg, config, persona, }); @@ -1156,7 +1203,7 @@ export async function synthesizeSpeech(params: { const prepared = await prepareSpeechSynthesis({ provider: resolvedProvider.provider, text: params.text, - cfg: params.cfg, + cfg, providerConfig: resolvedProvider.providerConfig, providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.provider.id], persona: resolvedProvider.synthesisPersona, @@ -1166,7 +1213,7 @@ export async function synthesizeSpeech(params: { }); const synthesis = await resolvedProvider.provider.synthesize({ text: prepared.text, - cfg: params.cfg, + cfg, providerConfig: prepared.providerConfig, target, providerOverrides: prepared.providerOverrides, @@ -1243,7 +1290,7 @@ export async function textToSpeechTelephony(params: { return { success: false, error: setup.error }; } - const { config, persona, providers } = setup; + const { cfg, config, persona, providers } = setup; const errors: string[] = []; const attemptedProviders: string[] = []; const attempts: TtsProviderAttempt[] = []; @@ -1258,7 +1305,7 @@ export async function textToSpeechTelephony(params: { try { const resolvedProvider = resolveReadySpeechProvider({ provider, - cfg: params.cfg, + cfg, config, persona, requireTelephony: true, @@ -1284,7 +1331,7 @@ export async function textToSpeechTelephony(params: { const prepared = await prepareSpeechSynthesis({ provider: resolvedProvider.provider, text: params.text, - cfg: params.cfg, + cfg, providerConfig: resolvedProvider.providerConfig, persona: resolvedProvider.synthesisPersona, personaProviderConfig: resolvedProvider.personaProviderConfig, @@ -1293,7 +1340,7 @@ export async function textToSpeechTelephony(params: { }); const synthesis = await synthesizeTelephony({ text: prepared.text, - cfg: params.cfg, + cfg, providerConfig: prepared.providerConfig, timeoutMs: config.timeoutMs, }); @@ -1360,15 +1407,16 @@ export async function listSpeechVoices(params: { apiKey?: string; baseUrl?: string; }): Promise { - const provider = canonicalizeSpeechProviderId(params.provider, params.cfg); + const cfg = params.cfg ? resolveTtsRuntimeConfig(params.cfg) : undefined; + const provider = canonicalizeSpeechProviderId(params.provider, cfg); if (!provider) { throw new Error("speech provider id is required"); } - const config = params.config ?? (params.cfg ? resolveTtsConfig(params.cfg) : undefined); + const config = params.config ?? (cfg ? resolveTtsConfig(cfg) : undefined); if (!config) { throw new Error(`speech provider ${provider} requires cfg or resolved config`); } - const resolvedProvider = getSpeechProvider(provider, params.cfg); + const resolvedProvider = getSpeechProvider(provider, cfg); if (!resolvedProvider) { throw new Error(`speech provider ${provider} is not registered`); } @@ -1376,8 +1424,8 @@ export async function listSpeechVoices(params: { throw new Error(`speech provider ${provider} does not support voice listing`); } return await resolvedProvider.listVoices({ - cfg: params.cfg, - providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, params.cfg), + cfg, + providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, cfg), apiKey: params.apiKey, baseUrl: params.baseUrl, }); @@ -1396,8 +1444,9 @@ export async function maybeApplyTtsToPayload(params: { if (params.payload.isCompactionNotice) { return params.payload; } + const cfg = resolveTtsRuntimeConfig(params.cfg); const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ - cfg: params.cfg, + cfg, sessionAuto: params.ttsAuto, agentId: params.agentId, channelId: params.channel, @@ -1406,7 +1455,7 @@ export async function maybeApplyTtsToPayload(params: { if (autoMode === "off") { return params.payload; } - const config = resolveTtsConfig(params.cfg, { + const config = resolveTtsConfig(cfg, { agentId: params.agentId, channelId: params.channel, accountId: params.accountId, @@ -1416,7 +1465,7 @@ export async function maybeApplyTtsToPayload(params: { const reply = resolveSendableOutboundReplyParts(params.payload); const text = reply.text; const directives = parseTtsDirectives(text, config.modelOverrides, { - cfg: params.cfg, + cfg, providerConfigs: config.providerConfigs, preferredProviderId: activeProvider, }); @@ -1426,7 +1475,7 @@ export async function maybeApplyTtsToPayload(params: { if (isVerbose()) { const effectiveProvider = directives.overrides?.provider - ? (canonicalizeSpeechProviderId(directives.overrides.provider, params.cfg) ?? activeProvider) + ? (canonicalizeSpeechProviderId(directives.overrides.provider, cfg) ?? activeProvider) : activeProvider; logVerbose( `TTS: auto mode enabled (${autoMode}), channel=${params.channel}, selected provider=${effectiveProvider}, config.provider=${config.provider}, config.providerSource=${config.providerSource}`, @@ -1486,7 +1535,7 @@ export async function maybeApplyTtsToPayload(params: { const summary = await summarizeText({ text: textForAudio, targetLength: maxLength, - cfg: params.cfg, + cfg, config, timeoutMs: config.timeoutMs, }); @@ -1514,7 +1563,7 @@ export async function maybeApplyTtsToPayload(params: { const ttsStart = Date.now(); const result = await textToSpeech({ text: textForAudio, - cfg: params.cfg, + cfg, prefsPath, channel: params.channel, overrides: directives.overrides,