From c11730fd09fee52c44d271cb7d99735a9ae93072 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 05:37:44 +0100 Subject: [PATCH] fix(tts): keep speech fallback discovery scoped --- .../provider-registry.allowlist.test.ts | 1 + .../provider-registry.allowlist.test.ts | 1 + .../capability-provider-runtime.test.ts | 132 ++++++++- src/plugins/capability-provider-runtime.ts | 105 ++++++- src/tts/provider-registry.test.ts | 1 + test/helpers/plugins/tts-contract-suites.ts | 275 ++++++++++-------- 6 files changed, 386 insertions(+), 129 deletions(-) diff --git a/src/image-generation/provider-registry.allowlist.test.ts b/src/image-generation/provider-registry.allowlist.test.ts index 2466f39137b..d444cb8d07c 100644 --- a/src/image-generation/provider-registry.allowlist.test.ts +++ b/src/image-generation/provider-registry.allowlist.test.ts @@ -26,6 +26,7 @@ describe("image-generation provider registry allowlist fallback", () => { expect(getImageGenerationProvider("openai", cfg as OpenClawConfig)).toBeUndefined(); expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ config: compatConfig, + activate: false, }); }); }); diff --git a/src/media-understanding/provider-registry.allowlist.test.ts b/src/media-understanding/provider-registry.allowlist.test.ts index ca315b0bbad..3238b572f88 100644 --- a/src/media-understanding/provider-registry.allowlist.test.ts +++ b/src/media-understanding/provider-registry.allowlist.test.ts @@ -27,6 +27,7 @@ describe("media-understanding provider registry allowlist fallback", () => { expect(getMediaUnderstandingProvider("openai", registry)).toBeUndefined(); expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ config: compatConfig, + activate: false, }); }); }); diff --git a/src/plugins/capability-provider-runtime.test.ts b/src/plugins/capability-provider-runtime.test.ts index 058b8f62366..1ee6c7f6100 100644 --- a/src/plugins/capability-provider-runtime.test.ts +++ b/src/plugins/capability-provider-runtime.test.ts @@ -82,6 +82,7 @@ function expectBundledCompatLoadPath(params: { }); expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ config: params.enablementCompat, + activate: false, }); } @@ -203,7 +204,36 @@ describe("resolvePluginCapabilityProviders", () => { expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith(); }); - it("keeps active capability providers when cfg compat has no extra providers", () => { + it("uses active non-speech capability providers even when cfg is passed", () => { + const active = createEmptyPluginRegistry(); + active.mediaUnderstandingProviders.push({ + pluginId: "deepgram", + pluginName: "Deepgram", + source: "test", + provider: { + id: "deepgram", + capabilities: ["audio"], + }, + } as never); + mocks.resolveRuntimePluginRegistry.mockReturnValue(active); + + const providers = resolvePluginCapabilityProviders({ + key: "mediaUnderstandingProviders", + cfg: { + tools: { + media: { + models: [{ provider: "deepgram" }], + }, + }, + } as OpenClawConfig, + }); + + expectResolvedCapabilityProviderIds(providers, ["deepgram"]); + expect(mocks.loadPluginManifestRegistry).not.toHaveBeenCalled(); + expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith(); + }); + + it("keeps active speech providers when cfg requests an active provider alias", () => { const active = createEmptyPluginRegistry(); active.speechProviders.push({ pluginId: "microsoft", @@ -222,9 +252,7 @@ describe("resolvePluginCapabilityProviders", () => { }), }, } as never); - mocks.resolveRuntimePluginRegistry.mockImplementation((params?: unknown) => - params === undefined ? active : createEmptyPluginRegistry(), - ); + mocks.resolveRuntimePluginRegistry.mockReturnValue(active); const providers = resolvePluginCapabilityProviders({ key: "speechProviders", @@ -235,10 +263,8 @@ describe("resolvePluginCapabilityProviders", () => { }); expectResolvedCapabilityProviderIds(providers, ["microsoft"]); + expect(mocks.loadPluginManifestRegistry).not.toHaveBeenCalled(); expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith(); - expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ - config: expect.anything(), - }); }); it("keeps active capability providers when cfg has no explicit plugin config", () => { @@ -349,9 +375,94 @@ describe("resolvePluginCapabilityProviders", () => { allow: ["openai", "microsoft"], }), }), + activate: false, }); }); + it("does not merge unrelated bundled capability providers when cfg requests one provider", () => { + const active = createEmptyPluginRegistry(); + active.speechProviders.push({ + pluginId: "openai", + pluginName: "openai", + source: "test", + provider: { + id: "openai", + label: "openai", + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: Buffer.from("x"), + outputFormat: "mp3", + voiceCompatible: false, + fileExtension: ".mp3", + }), + }, + } as never); + const loaded = createEmptyPluginRegistry(); + loaded.speechProviders.push( + { + pluginId: "microsoft", + pluginName: "microsoft", + source: "test", + provider: { + id: "microsoft", + label: "microsoft", + aliases: ["edge"], + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: Buffer.from("x"), + outputFormat: "mp3", + voiceCompatible: false, + fileExtension: ".mp3", + }), + }, + } as never, + { + pluginId: "elevenlabs", + pluginName: "elevenlabs", + source: "test", + provider: { + id: "elevenlabs", + label: "elevenlabs", + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: Buffer.from("x"), + outputFormat: "mp3", + voiceCompatible: false, + fileExtension: ".mp3", + }), + }, + } as never, + ); + mocks.loadPluginManifestRegistry.mockReturnValue({ + plugins: [ + { + id: "microsoft", + origin: "bundled", + contracts: { speechProviders: ["microsoft"] }, + }, + { + id: "elevenlabs", + origin: "bundled", + contracts: { speechProviders: ["elevenlabs"] }, + }, + ] as never, + diagnostics: [], + }); + mocks.resolveRuntimePluginRegistry.mockImplementation((params?: unknown) => + params === undefined ? active : loaded, + ); + + const providers = resolvePluginCapabilityProviders({ + key: "speechProviders", + cfg: { + plugins: { allow: ["openai", "microsoft", "elevenlabs"] }, + messages: { tts: { provider: "edge" } }, + } as OpenClawConfig, + }); + + expectResolvedCapabilityProviderIds(providers, ["openai", "microsoft"]); + }); + it.each([ ["memoryEmbeddingProviders", "memoryEmbeddingProviders"], ["speechProviders", "speechProviders"], @@ -384,6 +495,7 @@ describe("resolvePluginCapabilityProviders", () => { expectNoResolvedCapabilityProviders(providers); expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ config: expect.anything(), + activate: false, }); }); @@ -424,7 +536,10 @@ describe("resolvePluginCapabilityProviders", () => { config: undefined, env: process.env, }); - expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ config: compatConfig }); + expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ + config: compatConfig, + activate: false, + }); }); it("loads only the bundled owner plugin for a targeted provider lookup", () => { @@ -488,6 +603,7 @@ describe("resolvePluginCapabilityProviders", () => { }); expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ config: enablementCompat, + activate: false, }); }); }); diff --git a/src/plugins/capability-provider-runtime.ts b/src/plugins/capability-provider-runtime.ts index 56773ca120c..599a6f60024 100644 --- a/src/plugins/capability-provider-runtime.ts +++ b/src/plugins/capability-provider-runtime.ts @@ -123,6 +123,81 @@ function mergeCapabilityProviders( return [...merged.values(), ...unnamed]; } +function addObjectKeys(target: Set, value: unknown): void { + if (typeof value !== "object" || value === null || Array.isArray(value)) { + return; + } + for (const key of Object.keys(value)) { + const normalized = key.trim().toLowerCase(); + if (normalized) { + target.add(normalized); + } + } +} + +function addStringValue(target: Set, value: unknown): void { + if (typeof value !== "string") { + return; + } + const normalized = value.trim().toLowerCase(); + if (normalized) { + target.add(normalized); + } +} + +function collectRequestedSpeechProviderIds(cfg: OpenClawConfig | undefined): Set { + const requested = new Set(); + const tts = + typeof cfg?.messages?.tts === "object" && cfg.messages.tts !== null + ? (cfg.messages.tts as Record) + : undefined; + addStringValue(requested, tts?.provider); + addObjectKeys(requested, tts?.providers); + addObjectKeys(requested, cfg?.models?.providers); + return requested; +} + +function removeActiveProviderIds(requested: Set, entries: readonly unknown[]): void { + for (const entry of entries as Array<{ provider: { id?: unknown; aliases?: unknown } }>) { + const provider = entry.provider as { id?: unknown; aliases?: unknown }; + if (typeof provider.id === "string") { + requested.delete(provider.id.toLowerCase()); + } + if (Array.isArray(provider.aliases)) { + for (const alias of provider.aliases) { + if (typeof alias === "string") { + requested.delete(alias.toLowerCase()); + } + } + } + } +} + +function filterLoadedProvidersForRequestedConfig(params: { + key: K; + requested: Set; + entries: PluginRegistry[K]; +}): PluginRegistry[K] { + if (params.key !== "speechProviders") { + return [] as unknown as PluginRegistry[K]; + } + if (params.requested.size === 0) { + return [] as unknown as PluginRegistry[K]; + } + return params.entries.filter((entry) => { + const provider = entry.provider as { id?: unknown; aliases?: unknown }; + if (typeof provider.id === "string" && params.requested.has(provider.id.toLowerCase())) { + return true; + } + if (Array.isArray(provider.aliases)) { + return provider.aliases.some( + (alias) => typeof alias === "string" && params.requested.has(alias.toLowerCase()), + ); + } + return false; + }) as PluginRegistry[K]; +} + export function resolvePluginCapabilityProvider(params: { key: K; providerId: string; @@ -148,7 +223,8 @@ export function resolvePluginCapabilityProvider 0 && params.key !== "memoryEmbeddingProviders" && + params.key !== "speechProviders" && !hasExplicitPluginConfig(params.cfg?.plugins) ) { return activeProviders.map((entry) => entry.provider) as CapabilityProviderForKey[]; } + if (activeProviders.length > 0 && params.key === "speechProviders" && !params.cfg) { + return activeProviders.map((entry) => entry.provider) as CapabilityProviderForKey[]; + } + const missingRequestedSpeechProviders = + activeProviders.length > 0 && params.key === "speechProviders" + ? collectRequestedSpeechProviderIds(params.cfg) + : undefined; + if (missingRequestedSpeechProviders) { + removeActiveProviderIds(missingRequestedSpeechProviders, activeProviders); + if (missingRequestedSpeechProviders.size === 0) { + return activeProviders.map((entry) => entry.provider) as CapabilityProviderForKey[]; + } + } const compatConfig = resolveCapabilityProviderConfig({ key: params.key, cfg: params.cfg }); - const loadOptions = compatConfig === undefined ? undefined : { config: compatConfig }; + const loadOptions = + compatConfig === undefined ? undefined : { config: compatConfig, activate: false }; const registry = resolveRuntimePluginRegistry(loadOptions); const loadedProviders = registry?.[params.key] ?? []; if (params.key !== "memoryEmbeddingProviders") { - return mergeCapabilityProviders(activeProviders, loadedProviders); + const mergeLoadedProviders = + activeProviders.length > 0 + ? filterLoadedProvidersForRequestedConfig({ + key: params.key, + requested: missingRequestedSpeechProviders ?? new Set(), + entries: loadedProviders, + }) + : loadedProviders; + return mergeCapabilityProviders(activeProviders, mergeLoadedProviders); } return mergeCapabilityProviders(activeProviders, loadedProviders); } diff --git a/src/tts/provider-registry.test.ts b/src/tts/provider-registry.test.ts index f8e122cb5b6..3ccba8c5760 100644 --- a/src/tts/provider-registry.test.ts +++ b/src/tts/provider-registry.test.ts @@ -123,6 +123,7 @@ describe("speech provider registry", () => { }, }, }, + activate: false, }); }); diff --git a/test/helpers/plugins/tts-contract-suites.ts b/test/helpers/plugins/tts-contract-suites.ts index 35990fccda1..342791fb29e 100644 --- a/test/helpers/plugins/tts-contract-suites.ts +++ b/test/helpers/plugins/tts-contract-suites.ts @@ -5,7 +5,7 @@ import { createEmptyPluginRegistry } from "../../../src/plugins/registry-empty.j import { setActivePluginRegistry } from "../../../src/plugins/runtime.js"; import type { SpeechProviderPlugin } from "../../../src/plugins/types.js"; import { resolveWorkspacePackagePublicModuleUrl } from "../../../src/test-utils/bundled-plugin-public-surface.js"; -import { withEnv } from "../../../src/test-utils/env.js"; +import { withEnv, withEnvAsync } from "../../../src/test-utils/env.js"; import type { ResolvedTtsConfig } from "../../../src/tts/tts-types.js"; type TtsRuntimeModule = typeof import("../../../src/tts/tts.js"); @@ -36,6 +36,41 @@ let getResolvedSpeechProviderConfig: TtsRuntimeModule["_test"]["getResolvedSpeec let formatTtsProviderError: TtsRuntimeModule["_test"]["formatTtsProviderError"]; let sanitizeTtsErrorForLog: TtsRuntimeModule["_test"]["sanitizeTtsErrorForLog"]; +const SPEECH_PROVIDER_ENV_KEYS = [ + "ELEVENLABS_API_KEY", + "GEMINI_API_KEY", + "GOOGLE_API_KEY", + "GRADIUM_API_KEY", + "MINIMAX_API_KEY", + "OPENAI_API_KEY", + "VYDRA_API_KEY", + "XAI_API_KEY", + "XI_API_KEY", +] as const; + +function isolatedSpeechProviderEnv( + overrides: Record = {}, +): Record { + return { + ...Object.fromEntries(SPEECH_PROVIDER_ENV_KEYS.map((key) => [key, undefined])), + ...overrides, + }; +} + +function withIsolatedSpeechProviderEnv( + overrides: Record, + fn: () => T, +): T { + return withEnv(isolatedSpeechProviderEnv(overrides), fn); +} + +async function withIsolatedSpeechProviderEnvAsync( + overrides: Record, + fn: () => Promise, +): Promise { + return await withEnvAsync(isolatedSpeechProviderEnv(overrides), fn); +} + vi.mock("@mariozechner/pi-ai", () => { const getApiProvider = vi.fn(() => undefined); return { @@ -670,7 +705,7 @@ export function describeTtsConfigContract() { expected: "microsoft", }, ] as const)("selects provider based on available API keys: $name", (testCase) => { - withEnv(testCase.env, () => { + withIsolatedSpeechProviderEnv(testCase.env, () => { const config = { auto: "off", mode: "final", @@ -693,7 +728,7 @@ export function describeTtsConfigContract() { }); it("passes cfg into auto-selection so model-provider Google keys can configure TTS", () => { - withEnv( + withIsolatedSpeechProviderEnv( { OPENAI_API_KEY: undefined, ELEVENLABS_API_KEY: undefined, @@ -974,133 +1009,137 @@ export function describeTtsProviderRuntimeContract() { describe("fallback readiness errors", () => { it("continues synthesize fallback when primary readiness checks throw", async () => { - const throwingPrimary: SpeechProviderPlugin = { - id: "openai", - label: "OpenAI", - autoSelectOrder: 10, - resolveConfig: () => ({}), - isConfigured: () => { - throw new Error("Authorization: Bearer sk-readiness-throw-token-1234567890\nboom"); - }, - synthesize: async () => { - throw new Error("unexpected synthesize call"); - }, - }; - const fallback: SpeechProviderPlugin = { - id: "microsoft", - label: "Microsoft", - autoSelectOrder: 20, - resolveConfig: () => ({}), - isConfigured: () => true, - synthesize: async () => ({ - audioBuffer: createAudioBuffer(2), - outputFormat: "mp3", - fileExtension: ".mp3", - voiceCompatible: true, - }), - }; - const registry = createEmptyPluginRegistry(); - registry.speechProviders = [ - { pluginId: "openai", provider: throwingPrimary, source: "test" }, - { pluginId: "microsoft", provider: fallback, source: "test" }, - ]; - setActivePluginRegistry(registry); + await withIsolatedSpeechProviderEnvAsync({}, async () => { + const throwingPrimary: SpeechProviderPlugin = { + id: "openai", + label: "OpenAI", + autoSelectOrder: 10, + resolveConfig: () => ({}), + isConfigured: () => { + throw new Error("Authorization: Bearer sk-readiness-throw-token-1234567890\nboom"); + }, + synthesize: async () => { + throw new Error("unexpected synthesize call"); + }, + }; + const fallback: SpeechProviderPlugin = { + id: "microsoft", + label: "Microsoft", + autoSelectOrder: 20, + resolveConfig: () => ({}), + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: createAudioBuffer(2), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: true, + }), + }; + const registry = createEmptyPluginRegistry(); + registry.speechProviders = [ + { pluginId: "openai", provider: throwingPrimary, source: "test" }, + { pluginId: "microsoft", provider: fallback, source: "test" }, + ]; + setActivePluginRegistry(registry); - const result = await ttsRuntime.synthesizeSpeech({ - text: "hello fallback", - cfg: { - messages: { - tts: { - provider: "openai", + const result = await ttsRuntime.synthesizeSpeech({ + text: "hello fallback", + cfg: { + messages: { + tts: { + provider: "openai", + }, }, }, - }, - }); + }); - expect(result.success).toBe(true); - if (!result.success) { - throw new Error("expected fallback synthesis success"); - } - expect(result.provider).toBe("microsoft"); - expect(result.fallbackFrom).toBe("openai"); - expect(result.attemptedProviders).toEqual(["openai", "microsoft"]); - expect(result.attempts?.[0]).toMatchObject({ - provider: "openai", - outcome: "failed", - reasonCode: "provider_error", - }); - expect(result.attempts?.[1]).toMatchObject({ - provider: "microsoft", - outcome: "success", - reasonCode: "success", + expect(result.success).toBe(true); + if (!result.success) { + throw new Error("expected fallback synthesis success"); + } + expect(result.provider).toBe("microsoft"); + expect(result.fallbackFrom).toBe("openai"); + expect(result.attemptedProviders).toEqual(["openai", "microsoft"]); + expect(result.attempts?.[0]).toMatchObject({ + provider: "openai", + outcome: "failed", + reasonCode: "provider_error", + }); + expect(result.attempts?.[1]).toMatchObject({ + provider: "microsoft", + outcome: "success", + reasonCode: "success", + }); }); }); it("continues telephony fallback when primary readiness checks throw", async () => { - const throwingPrimary: SpeechProviderPlugin = { - id: "primary-throws", - label: "PrimaryThrows", - autoSelectOrder: 10, - resolveConfig: () => ({}), - isConfigured: () => { - throw new Error("Authorization: Bearer sk-telephony-throw-token-1234567890\tboom"); - }, - synthesize: async () => { - throw new Error("unexpected synthesize call"); - }, - }; - const fallback: SpeechProviderPlugin = { - id: "microsoft", - label: "Microsoft", - autoSelectOrder: 20, - resolveConfig: () => ({}), - isConfigured: () => true, - synthesize: async () => ({ - audioBuffer: createAudioBuffer(2), - outputFormat: "mp3", - fileExtension: ".mp3", - voiceCompatible: true, - }), - synthesizeTelephony: async () => ({ - audioBuffer: createAudioBuffer(2), - outputFormat: "mp3", - sampleRate: 24000, - }), - }; - const registry = createEmptyPluginRegistry(); - registry.speechProviders = [ - { pluginId: "primary-throws", provider: throwingPrimary, source: "test" }, - { pluginId: "microsoft", provider: fallback, source: "test" }, - ]; - setActivePluginRegistry(registry); + await withIsolatedSpeechProviderEnvAsync({}, async () => { + const throwingPrimary: SpeechProviderPlugin = { + id: "primary-throws", + label: "PrimaryThrows", + autoSelectOrder: 10, + resolveConfig: () => ({}), + isConfigured: () => { + throw new Error("Authorization: Bearer sk-telephony-throw-token-1234567890\tboom"); + }, + synthesize: async () => { + throw new Error("unexpected synthesize call"); + }, + }; + const fallback: SpeechProviderPlugin = { + id: "microsoft", + label: "Microsoft", + autoSelectOrder: 20, + resolveConfig: () => ({}), + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: createAudioBuffer(2), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: true, + }), + synthesizeTelephony: async () => ({ + audioBuffer: createAudioBuffer(2), + outputFormat: "mp3", + sampleRate: 24000, + }), + }; + const registry = createEmptyPluginRegistry(); + registry.speechProviders = [ + { pluginId: "primary-throws", provider: throwingPrimary, source: "test" }, + { pluginId: "microsoft", provider: fallback, source: "test" }, + ]; + setActivePluginRegistry(registry); - const result = await ttsRuntime.textToSpeechTelephony({ - text: "hello telephony fallback", - cfg: { - messages: { - tts: { - provider: "primary-throws", + const result = await ttsRuntime.textToSpeechTelephony({ + text: "hello telephony fallback", + cfg: { + messages: { + tts: { + provider: "primary-throws", + }, }, }, - }, - }); + }); - expect(result.success).toBe(true); - if (!result.success) { - throw new Error("expected telephony fallback success"); - } - expect(result.provider).toBe("microsoft"); - expect(result.fallbackFrom).toBe("primary-throws"); - expect(result.attemptedProviders).toEqual(["primary-throws", "microsoft"]); - expect(result.attempts?.[0]).toMatchObject({ - provider: "primary-throws", - outcome: "failed", - reasonCode: "provider_error", - }); - expect(result.attempts?.[1]).toMatchObject({ - provider: "microsoft", - outcome: "success", - reasonCode: "success", + expect(result.success).toBe(true); + if (!result.success) { + throw new Error("expected telephony fallback success"); + } + expect(result.provider).toBe("microsoft"); + expect(result.fallbackFrom).toBe("primary-throws"); + expect(result.attemptedProviders).toEqual(["primary-throws", "microsoft"]); + expect(result.attempts?.[0]).toMatchObject({ + provider: "primary-throws", + outcome: "failed", + reasonCode: "provider_error", + }); + expect(result.attempts?.[1]).toMatchObject({ + provider: "microsoft", + outcome: "success", + reasonCode: "success", + }); }); });