From f3d5c5488472ffcbaba4f94664ce1a772083b83b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 1 May 2026 11:37:59 +0100 Subject: [PATCH] fix: keep configured media STT providers registered --- CHANGELOG.md | 1 + .../capability-provider-runtime.test.ts | 77 +++++++++++++++++++ src/plugins/capability-provider-runtime.ts | 66 +++++++++++----- 3 files changed, 126 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89a27186832..2ef1e404d8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Discord/voice: merge configured media-understanding providers such as Deepgram into partial active provider registries, so follow-up voice turns keep transcribing after another media plugin is already active. Fixes #65687. Thanks @OneMintJulep. - Discord/voice: apply per-channel Discord `systemPrompt` overrides to voice transcript turns by forwarding the trusted channel prompt through the voice agent run. Fixes #47095. Thanks @qearlyao. - Discord/voice: run voice-channel turns under a voice-output policy that hides the agent `tts` tool and asks for spoken reply text, so `/vc join` sessions synthesize and play agent replies instead of ending with `NO_REPLY`. Fixes #61536. Thanks @aounakram. - Plugins/runtime-deps: prune inactive same-package versioned runtime-deps roots after bundled dependency repair, so upgrades do not leave old `openclaw--` package caches behind after doctor runs. Thanks @vincentkoc. 
diff --git a/src/plugins/capability-provider-runtime.test.ts b/src/plugins/capability-provider-runtime.test.ts index 9bb540db323..d8768c57f34 100644 --- a/src/plugins/capability-provider-runtime.test.ts +++ b/src/plugins/capability-provider-runtime.test.ts @@ -265,6 +265,83 @@ describe("resolvePluginCapabilityProviders", () => { expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith(); }); + /* Scenario: the active registry holds only "openai" while cfg.tools.media.audio.models names "deepgram", and the registry returned for a non-undefined argument holds "deepgram" and "google". Expects the resolved provider ids to be ["openai", "deepgram"] (the loaded "google" entry is not included), and expects resolveRuntimePluginRegistry to have been called with no arguments and also with onlyPluginIds ["deepgram", "google"], activate: false, installBundledRuntimeDeps: false. */ it("merges configured media-understanding providers missing from the active registry", () => { + const active = createEmptyPluginRegistry(); + active.mediaUnderstandingProviders.push({ + pluginId: "openai", + pluginName: "OpenAI", + source: "test", + provider: { + id: "openai", + capabilities: ["image"], + }, + } as never); + const loaded = createEmptyPluginRegistry(); + loaded.mediaUnderstandingProviders.push( + { + pluginId: "deepgram", + pluginName: "Deepgram", + source: "test", + provider: { + id: "deepgram", + capabilities: ["audio"], + }, + } as never, + { + pluginId: "google", + pluginName: "Google", + source: "test", + provider: { + id: "google", + capabilities: ["image", "audio", "video"], + }, + } as never, + ); + mocks.loadPluginManifestRegistry.mockReturnValue({ + plugins: [ + { + id: "deepgram", + origin: "bundled", + contracts: { mediaUnderstandingProviders: ["deepgram"] }, + }, + { + id: "google", + origin: "bundled", + contracts: { mediaUnderstandingProviders: ["google"] }, + }, + ] as never, + diagnostics: [], + }); + mocks.resolveRuntimePluginRegistry.mockImplementation((params?: unknown) => + params === undefined ? 
active : loaded, + ); + + const providers = resolvePluginCapabilityProviders({ + key: "mediaUnderstandingProviders", + cfg: { + plugins: { allow: ["openai", "deepgram", "google"] }, + tools: { + media: { + audio: { enabled: true, models: [{ provider: "deepgram", model: "nova-3" }] }, + }, + }, + } as OpenClawConfig, + }); + + expectResolvedCapabilityProviderIds(providers, ["openai", "deepgram"]); + expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith(); + expect(mocks.resolveRuntimePluginRegistry).toHaveBeenCalledWith({ + config: expect.objectContaining({ + plugins: expect.objectContaining({ + allow: ["openai", "deepgram", "google"], + }), + }), + onlyPluginIds: ["deepgram", "google"], + activate: false, + installBundledRuntimeDeps: false, + }); + }); + it("keeps active speech providers when cfg requests an active provider alias", () => { const active = createEmptyPluginRegistry(); active.speechProviders.push({ diff --git a/src/plugins/capability-provider-runtime.ts b/src/plugins/capability-provider-runtime.ts index 881cd9745cd..07a63d4e5a5 100644 --- a/src/plugins/capability-provider-runtime.ts +++ b/src/plugins/capability-provider-runtime.ts @@ -241,6 +241,43 @@ function collectRequestedSpeechProviderIds(cfg: OpenClawConfig | undefined): Set return requested; } +/** Adds the provider field of each non-null object entry in value to target through addStringValue; returns without touching target when value is not an array. NOTE(review): the bare Set type here looks like it lost its type argument during extraction (presumably Set<string>) - confirm against the repository file. */ function addMediaModelProviders(target: Set, value: unknown): void { + if (!Array.isArray(value)) { + return; + } + for (const entry of value) { + if (typeof entry === "object" && entry !== null) { + addStringValue(target, (entry as { provider?: unknown }).provider); + } + } +} + +/** Builds the set of provider values named in cfg.tools.media: the shared models list plus the image, audio, and video models lists. Missing config sections contribute nothing, because addMediaModelProviders ignores any non-array input. */ function collectRequestedMediaUnderstandingProviderIds( + cfg: OpenClawConfig | undefined, +): Set { + const requested = new Set(); + const media = cfg?.tools?.media; + addMediaModelProviders(requested, media?.models); + addMediaModelProviders(requested, media?.image?.models); + addMediaModelProviders(requested, media?.audio?.models); + addMediaModelProviders(requested, media?.video?.models); + return requested; +} + 
+/** Selects the config-derived provider id collector for params.key: "speechProviders" uses collectRequestedSpeechProviderIds and "mediaUnderstandingProviders" uses collectRequestedMediaUnderstandingProviderIds, each given params.cfg. Any other key returns undefined; in the resolver hunk of this patch an undefined result makes the caller return the active providers as-is. */ function collectRequestedCapabilityProviderIds(params: { + key: CapabilityProviderRegistryKey; + cfg?: OpenClawConfig; +}): Set | undefined { + switch (params.key) { + case "speechProviders": + return collectRequestedSpeechProviderIds(params.cfg); + case "mediaUnderstandingProviders": + return collectRequestedMediaUnderstandingProviderIds(params.cfg); + default: + return undefined; + } +} + function removeActiveProviderIds(requested: Set, entries: readonly unknown[]): void { for (const entry of entries as Array<{ provider: { id?: unknown; aliases?: unknown } }>) { const provider = entry.provider as { id?: unknown; aliases?: unknown }; @@ -262,7 +299,7 @@ function filterLoadedProvidersForRequestedConfig; entries: PluginRegistry[K]; }): PluginRegistry[K] { - if (params.key !== "speechProviders") { + if (params.key !== "speechProviders" && params.key !== "mediaUnderstandingProviders") { return [] as unknown as PluginRegistry[K]; } if (params.requested.size === 0) { @@ -341,23 +378,16 @@ export function resolvePluginCapabilityProviders 0 && - params.key !== "memoryEmbeddingProviders" && - params.key !== "speechProviders" - ) { - return activeProviders.map((entry) => entry.provider) as CapabilityProviderForKey[]; - } - if (activeProviders.length > 0 && params.key === "speechProviders" && !params.cfg) { - return activeProviders.map((entry) => entry.provider) as CapabilityProviderForKey[]; - } - const missingRequestedSpeechProviders = - activeProviders.length > 0 && params.key === "speechProviders" - ? collectRequestedSpeechProviderIds(params.cfg) + const missingRequestedProviders = + activeProviders.length > 0 + ? 
collectRequestedCapabilityProviderIds({ key: params.key, cfg: params.cfg }) : undefined; - if (missingRequestedSpeechProviders) { - removeActiveProviderIds(missingRequestedSpeechProviders, activeProviders); - if (missingRequestedSpeechProviders.size === 0) { + if (activeProviders.length > 0 && params.key !== "memoryEmbeddingProviders") { + if (!missingRequestedProviders) { + return activeProviders.map((entry) => entry.provider) as CapabilityProviderForKey[]; + } + removeActiveProviderIds(missingRequestedProviders, activeProviders); + if (missingRequestedProviders.size === 0) { return activeProviders.map((entry) => entry.provider) as CapabilityProviderForKey[]; } } @@ -390,7 +420,7 @@ export function resolvePluginCapabilityProviders 0 ? filterLoadedProvidersForRequestedConfig({ key: params.key, - requested: missingRequestedSpeechProviders ?? new Set(), + requested: missingRequestedProviders ?? new Set(), entries: loadedProviders, }) : loadedProviders;