From 3f32aa7582995546d0b81abc201ca38f5720a4b5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 12 Apr 2026 12:00:42 +0100 Subject: [PATCH] fix(media): decouple capability registry from runtime loaders --- src/media-understanding/entry-capabilities.ts | 12 ++-- .../provider-capability-registry.ts | 61 +++++++++++++++++++ src/media-understanding/types.ts | 7 +++ src/secrets/runtime-config-collectors-core.ts | 6 +- 4 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 src/media-understanding/provider-capability-registry.ts diff --git a/src/media-understanding/entry-capabilities.ts b/src/media-understanding/entry-capabilities.ts index d59ccb15a5f..6349b5568d1 100644 --- a/src/media-understanding/entry-capabilities.ts +++ b/src/media-understanding/entry-capabilities.ts @@ -1,13 +1,9 @@ import type { MediaUnderstandingModelConfig } from "../config/types.tools.js"; import { normalizeMediaProviderId } from "./provider-id.js"; -import type { MediaUnderstandingCapability } from "./types.js"; - -export type MediaUnderstandingCapabilityRegistry = Map< - string, - { - capabilities?: MediaUnderstandingCapability[]; - } ->; +import type { + MediaUnderstandingCapability, + MediaUnderstandingCapabilityRegistry, +} from "./types.js"; const MEDIA_CAPABILITIES = ["audio", "image", "video"] as const; diff --git a/src/media-understanding/provider-capability-registry.ts b/src/media-understanding/provider-capability-registry.ts new file mode 100644 index 00000000000..2206d112998 --- /dev/null +++ b/src/media-understanding/provider-capability-registry.ts @@ -0,0 +1,61 @@ +import type { OpenClawConfig } from "../config/types.js"; +import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js"; +import { normalizeMediaProviderId } from "./provider-id.js"; +import type { MediaUnderstandingCapabilityRegistry, MediaUnderstandingProvider } from "./types.js"; + +type ConfigProvider = NonNullable< + NonNullable["providers"]>[string] +>; + +type ConfigProviderModel = NonNullable[number]; + +function mergeProviderCapabilities( + registry: MediaUnderstandingCapabilityRegistry, + provider: Pick, +) { + const normalizedKey = normalizeMediaProviderId(provider.id); + const existing = registry.get(normalizedKey); + registry.set(normalizedKey, { + capabilities: provider.capabilities ?? existing?.capabilities, + }); +} + +export function buildMediaUnderstandingCapabilityRegistry( + cfg?: OpenClawConfig, +): MediaUnderstandingCapabilityRegistry { + const registry: MediaUnderstandingCapabilityRegistry = new Map(); + + for (const provider of resolvePluginCapabilityProviders({ + key: "mediaUnderstandingProviders", + cfg, + })) { + mergeProviderCapabilities(registry, provider); + } + + const configProviders = cfg?.models?.providers; + if (configProviders && typeof configProviders === "object") { + for (const [providerKey, providerCfg] of Object.entries(configProviders)) { + if (!providerKey?.trim()) { + continue; + } + const normalizedKey = normalizeMediaProviderId(providerKey); + if (registry.has(normalizedKey)) { + continue; + } + const models = providerCfg.models ?? []; + const hasImageModel = models.some( + (model: ConfigProviderModel) => + Array.isArray(model?.input) && model.input.includes("image"), + ); + if (!hasImageModel) { + continue; + } + mergeProviderCapabilities(registry, { + id: normalizedKey, + capabilities: ["image"], + }); + } + } + + return registry; +} diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 5f8e5a06d6b..052b8569b08 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -7,6 +7,13 @@ export type MediaUnderstandingKind = export type MediaUnderstandingCapability = "image" | "audio" | "video"; +export type MediaUnderstandingCapabilityRegistry = Map< + string, + { + capabilities?: MediaUnderstandingCapability[]; + } +>; + export type MediaAttachment = { path?: string; url?: string; diff --git a/src/secrets/runtime-config-collectors-core.ts b/src/secrets/runtime-config-collectors-core.ts index 6ea3655a6bd..8ce5183004d 100644 --- a/src/secrets/runtime-config-collectors-core.ts +++ b/src/secrets/runtime-config-collectors-core.ts @@ -4,7 +4,7 @@ import { resolveConfiguredMediaEntryCapabilities, resolveEffectiveMediaEntryCapabilities, } from "../media-understanding/entry-capabilities.js"; -import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js"; +import { buildMediaUnderstandingCapabilityRegistry } from "../media-understanding/provider-capability-registry.js"; import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js"; import { collectTtsApiKeyAssignments } from "./runtime-config-collectors-tts.js"; import { evaluateGatewayAuthSurfaceStates } from "./runtime-gateway-auth-surfaces.js"; @@ -401,9 +401,9 @@ function collectMediaRequestAssignments(params: { return; } - let providerRegistry: ReturnType | undefined; + let providerRegistry: ReturnType | undefined; const getProviderRegistry = () => { - providerRegistry ??= buildMediaUnderstandingRegistry(undefined, params.config); + providerRegistry ??= buildMediaUnderstandingCapabilityRegistry(params.config); return providerRegistry; }; const capabilityKeys = ["audio", "image", "video"] as const;