From 7189b49f818f55373d9b06216605e0ead7430849 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 22 Apr 2026 05:28:48 +0100 Subject: [PATCH] refactor: move media defaults into plugin manifests --- docs/plugins/manifest.md | 98 +++++++++++----- extensions/anthropic/openclaw.plugin.json | 12 ++ extensions/deepgram/openclaw.plugin.json | 11 ++ extensions/google/openclaw.plugin.json | 16 +++ extensions/groq/openclaw.plugin.json | 11 ++ extensions/minimax/openclaw.plugin.json | 20 ++++ extensions/mistral/openclaw.plugin.json | 11 ++ extensions/moonshot/openclaw.plugin.json | 12 ++ extensions/openai/openclaw.plugin.json | 19 +++ extensions/openrouter/openclaw.plugin.json | 8 ++ extensions/qwen/openclaw.plugin.json | 12 ++ extensions/zai/openclaw.plugin.json | 11 ++ src/agents/tools/pdf-tool.helpers.ts | 4 +- src/agents/tools/pdf-tool.model-config.ts | 29 +++-- src/media-understanding/bundled-defaults.ts | 109 ------------------ src/media-understanding/defaults.ts | 94 ++++++++------- src/media-understanding/manifest-metadata.ts | 36 ++++++ src/plugins/manifest-registry.test.ts | 48 ++++++++ src/plugins/manifest-registry.ts | 6 + src/plugins/manifest.ts | 104 +++++++++++++++++ ...hannel-contract-surface-guardrails.test.ts | 7 ++ 21 files changed, 484 insertions(+), 194 deletions(-) delete mode 100644 src/media-understanding/bundled-defaults.ts create mode 100644 src/media-understanding/manifest-metadata.ts diff --git a/docs/plugins/manifest.md b/docs/plugins/manifest.md index 5312d162c7f..4bdfaea915a 100644 --- a/docs/plugins/manifest.md +++ b/docs/plugins/manifest.md @@ -148,36 +148,37 @@ Those belong in your plugin code and `package.json`. ## Top-level field reference -| Field | Required | Type | What it means | -| ----------------------------------- | -------- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `id` | Yes | `string` | Canonical plugin id. This is the id used in `plugins.entries.`. | -| `configSchema` | Yes | `object` | Inline JSON Schema for this plugin's config. | -| `enabledByDefault` | No | `true` | Marks a bundled plugin as enabled by default. Omit it, or set any non-`true` value, to leave the plugin disabled by default. | -| `legacyPluginIds` | No | `string[]` | Legacy ids that normalize to this canonical plugin id. | -| `autoEnableWhenConfiguredProviders` | No | `string[]` | Provider ids that should auto-enable this plugin when auth, config, or model refs mention them. | -| `kind` | No | `"memory"` \| `"context-engine"` | Declares an exclusive plugin kind used by `plugins.slots.*`. | -| `channels` | No | `string[]` | Channel ids owned by this plugin. Used for discovery and config validation. | -| `providers` | No | `string[]` | Provider ids owned by this plugin. | -| `modelSupport` | No | `object` | Manifest-owned shorthand model-family metadata used to auto-load the plugin before runtime. | -| `providerEndpoints` | No | `object[]` | Manifest-owned endpoint host/baseUrl metadata for provider routes that core must classify before provider runtime loads. | -| `cliBackends` | No | `string[]` | CLI inference backend ids owned by this plugin. Used for startup auto-activation from explicit config refs. | -| `syntheticAuthRefs` | No | `string[]` | Provider or CLI backend refs whose plugin-owned synthetic auth hook should be probed during cold model discovery before runtime loads. | -| `nonSecretAuthMarkers` | No | `string[]` | Bundled-plugin-owned placeholder API key values that represent non-secret local, OAuth, or ambient credential state. | -| `commandAliases` | No | `object[]` | Command names owned by this plugin that should produce plugin-aware config and CLI diagnostics before runtime loads. | -| `providerAuthEnvVars` | No | `Record` | Cheap provider-auth env metadata that OpenClaw can inspect without loading plugin code. | -| `providerAuthAliases` | No | `Record` | Provider ids that should reuse another provider id for auth lookup, for example a coding provider that shares the base provider API key and auth profiles. | -| `channelEnvVars` | No | `Record` | Cheap channel env metadata that OpenClaw can inspect without loading plugin code. Use this for env-driven channel setup or auth surfaces that generic startup/config helpers should see. | -| `providerAuthChoices` | No | `object[]` | Cheap auth-choice metadata for onboarding pickers, preferred-provider resolution, and simple CLI flag wiring. | -| `activation` | No | `object` | Cheap activation hints for provider, command, channel, route, and capability-triggered loading. Metadata only; plugin runtime still owns actual behavior. | -| `setup` | No | `object` | Cheap setup/onboarding descriptors that discovery and setup surfaces can inspect without loading plugin runtime. | -| `qaRunners` | No | `object[]` | Cheap QA runner descriptors used by the shared `openclaw qa` host before plugin runtime loads. | -| `contracts` | No | `object` | Static bundled capability snapshot for speech, realtime transcription, realtime voice, media-understanding, image-generation, music-generation, video-generation, web-fetch, web search, and tool ownership. | -| `channelConfigs` | No | `Record` | Manifest-owned channel config metadata merged into discovery and validation surfaces before runtime loads. | -| `skills` | No | `string[]` | Skill directories to load, relative to the plugin root. | -| `name` | No | `string` | Human-readable plugin name. | -| `description` | No | `string` | Short summary shown in plugin surfaces. | -| `version` | No | `string` | Informational plugin version. | -| `uiHints` | No | `Record` | UI labels, placeholders, and sensitivity hints for config fields. | +| Field | Required | Type | What it means | +| ------------------------------------ | -------- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `id` | Yes | `string` | Canonical plugin id. This is the id used in `plugins.entries.`. | +| `configSchema` | Yes | `object` | Inline JSON Schema for this plugin's config. | +| `enabledByDefault` | No | `true` | Marks a bundled plugin as enabled by default. Omit it, or set any non-`true` value, to leave the plugin disabled by default. | +| `legacyPluginIds` | No | `string[]` | Legacy ids that normalize to this canonical plugin id. | +| `autoEnableWhenConfiguredProviders` | No | `string[]` | Provider ids that should auto-enable this plugin when auth, config, or model refs mention them. | +| `kind` | No | `"memory"` \| `"context-engine"` | Declares an exclusive plugin kind used by `plugins.slots.*`. | +| `channels` | No | `string[]` | Channel ids owned by this plugin. Used for discovery and config validation. | +| `providers` | No | `string[]` | Provider ids owned by this plugin. | +| `modelSupport` | No | `object` | Manifest-owned shorthand model-family metadata used to auto-load the plugin before runtime. | +| `providerEndpoints` | No | `object[]` | Manifest-owned endpoint host/baseUrl metadata for provider routes that core must classify before provider runtime loads. | +| `cliBackends` | No | `string[]` | CLI inference backend ids owned by this plugin. Used for startup auto-activation from explicit config refs. | +| `syntheticAuthRefs` | No | `string[]` | Provider or CLI backend refs whose plugin-owned synthetic auth hook should be probed during cold model discovery before runtime loads. | +| `nonSecretAuthMarkers` | No | `string[]` | Bundled-plugin-owned placeholder API key values that represent non-secret local, OAuth, or ambient credential state. | +| `commandAliases` | No | `object[]` | Command names owned by this plugin that should produce plugin-aware config and CLI diagnostics before runtime loads. | +| `providerAuthEnvVars` | No | `Record` | Cheap provider-auth env metadata that OpenClaw can inspect without loading plugin code. | +| `providerAuthAliases` | No | `Record` | Provider ids that should reuse another provider id for auth lookup, for example a coding provider that shares the base provider API key and auth profiles. | +| `channelEnvVars` | No | `Record` | Cheap channel env metadata that OpenClaw can inspect without loading plugin code. Use this for env-driven channel setup or auth surfaces that generic startup/config helpers should see. | +| `providerAuthChoices` | No | `object[]` | Cheap auth-choice metadata for onboarding pickers, preferred-provider resolution, and simple CLI flag wiring. | +| `activation` | No | `object` | Cheap activation hints for provider, command, channel, route, and capability-triggered loading. Metadata only; plugin runtime still owns actual behavior. | +| `setup` | No | `object` | Cheap setup/onboarding descriptors that discovery and setup surfaces can inspect without loading plugin runtime. | +| `qaRunners` | No | `object[]` | Cheap QA runner descriptors used by the shared `openclaw qa` host before plugin runtime loads. | +| `contracts` | No | `object` | Static bundled capability snapshot for speech, realtime transcription, realtime voice, media-understanding, image-generation, music-generation, video-generation, web-fetch, web search, and tool ownership. | +| `mediaUnderstandingProviderMetadata` | No | `Record` | Cheap media-understanding defaults for provider ids declared in `contracts.mediaUnderstandingProviders`. | +| `channelConfigs` | No | `Record` | Manifest-owned channel config metadata merged into discovery and validation surfaces before runtime loads. | +| `skills` | No | `string[]` | Skill directories to load, relative to the plugin root. | +| `name` | No | `string` | Human-readable plugin name. | +| `description` | No | `string` | Short summary shown in plugin surfaces. | +| `version` | No | `string` | Informational plugin version. | +| `uiHints` | No | `Record` | UI labels, placeholders, and sensitivity hints for config fields. | ## providerAuthChoices reference @@ -408,6 +409,43 @@ Each list is optional: | `webSearchProviders` | `string[]` | Web-search provider ids this plugin owns. | | `tools` | `string[]` | Agent tool names this plugin owns for bundled contract checks. | +## mediaUnderstandingProviderMetadata reference + +Use `mediaUnderstandingProviderMetadata` when a media-understanding provider has +default models, auto-auth fallback priority, or native document support that +generic core helpers need before runtime loads. Keys must also be declared in +`contracts.mediaUnderstandingProviders`. + +```json +{ + "contracts": { + "mediaUnderstandingProviders": ["example"] + }, + "mediaUnderstandingProviderMetadata": { + "example": { + "capabilities": ["image", "audio"], + "defaultModels": { + "image": "example-vision-latest", + "audio": "example-transcribe-latest" + }, + "autoPriority": { + "image": 40 + }, + "nativeDocumentInputs": ["pdf"] + } + } +} +``` + +Each provider entry can include: + +| Field | Type | What it means | +| ---------------------- | ----------------------------------- | ---------------------------------------------------------------------------- | +| `capabilities` | `("image" \| "audio" \| "video")[]` | Media capabilities exposed by this provider. | +| `defaultModels` | `Record` | Capability-to-model defaults used when config does not specify a model. | +| `autoPriority` | `Record` | Lower numbers sort earlier for automatic credential-based provider fallback. | +| `nativeDocumentInputs` | `"pdf"[]` | Native document inputs supported by the provider. | + ## channelConfigs reference Use `channelConfigs` when a channel plugin needs cheap config metadata before diff --git a/extensions/anthropic/openclaw.plugin.json b/extensions/anthropic/openclaw.plugin.json index 95bbd2c2a27..4e3e959367a 100644 --- a/extensions/anthropic/openclaw.plugin.json +++ b/extensions/anthropic/openclaw.plugin.json @@ -40,6 +40,18 @@ "contracts": { "mediaUnderstandingProviders": ["anthropic"] }, + "mediaUnderstandingProviderMetadata": { + "anthropic": { + "capabilities": ["image"], + "defaultModels": { + "image": "claude-opus-4-7" + }, + "autoPriority": { + "image": 20 + }, + "nativeDocumentInputs": ["pdf"] + } + }, "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/deepgram/openclaw.plugin.json b/extensions/deepgram/openclaw.plugin.json index d8d1e872154..6c0c7fb2012 100644 --- a/extensions/deepgram/openclaw.plugin.json +++ b/extensions/deepgram/openclaw.plugin.json @@ -7,6 +7,17 @@ "contracts": { "mediaUnderstandingProviders": ["deepgram"] }, + "mediaUnderstandingProviderMetadata": { + "deepgram": { + "capabilities": ["audio"], + "defaultModels": { + "audio": "nova-3" + }, + "autoPriority": { + "audio": 30 + } + } + }, "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/google/openclaw.plugin.json b/extensions/google/openclaw.plugin.json index 85ea4851984..02834c11ee2 100644 --- a/extensions/google/openclaw.plugin.json +++ b/extensions/google/openclaw.plugin.json @@ -53,6 +53,22 @@ "videoGenerationProviders": ["google"], "webSearchProviders": ["gemini"] }, + "mediaUnderstandingProviderMetadata": { + "google": { + "capabilities": ["image", "audio", "video"], + "defaultModels": { + "image": "gemini-3-flash-preview", + "audio": "gemini-3-flash-preview", + "video": "gemini-3-flash-preview" + }, + "autoPriority": { + "image": 30, + "audio": 40, + "video": 10 + }, + "nativeDocumentInputs": ["pdf"] + } + }, "configContracts": { "compatibilityRuntimePaths": ["tools.web.search.apiKey"] }, diff --git a/extensions/groq/openclaw.plugin.json b/extensions/groq/openclaw.plugin.json index 7db47dd7a53..489301a675b 100644 --- a/extensions/groq/openclaw.plugin.json +++ b/extensions/groq/openclaw.plugin.json @@ -7,6 +7,17 @@ "contracts": { "mediaUnderstandingProviders": ["groq"] }, + "mediaUnderstandingProviderMetadata": { + "groq": { + "capabilities": ["audio"], + "defaultModels": { + "audio": "whisper-large-v3-turbo" + }, + "autoPriority": { + "audio": 20 + } + } + }, "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/minimax/openclaw.plugin.json b/extensions/minimax/openclaw.plugin.json index 9484fe65fb4..d21933bbbc6 100644 --- a/extensions/minimax/openclaw.plugin.json +++ b/extensions/minimax/openclaw.plugin.json @@ -72,6 +72,26 @@ "configContracts": { "compatibilityRuntimePaths": ["tools.web.search.apiKey"] }, + "mediaUnderstandingProviderMetadata": { + "minimax": { + "capabilities": ["image"], + "defaultModels": { + "image": "MiniMax-VL-01" + }, + "autoPriority": { + "image": 40 + } + }, + "minimax-portal": { + "capabilities": ["image"], + "defaultModels": { + "image": "MiniMax-VL-01" + }, + "autoPriority": { + "image": 50 + } + } + }, "uiHints": { "webSearch.apiKey": { "label": "MiniMax Coding Plan key", diff --git a/extensions/mistral/openclaw.plugin.json b/extensions/mistral/openclaw.plugin.json index 53bf7e40e98..6f75022380e 100644 --- a/extensions/mistral/openclaw.plugin.json +++ b/extensions/mistral/openclaw.plugin.json @@ -24,6 +24,17 @@ "memoryEmbeddingProviders": ["mistral"], "mediaUnderstandingProviders": ["mistral"] }, + "mediaUnderstandingProviderMetadata": { + "mistral": { + "capabilities": ["audio"], + "defaultModels": { + "audio": "voxtral-mini-latest" + }, + "autoPriority": { + "audio": 50 + } + } + }, "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/moonshot/openclaw.plugin.json b/extensions/moonshot/openclaw.plugin.json index 0ded65b8c34..827af566e9d 100644 --- a/extensions/moonshot/openclaw.plugin.json +++ b/extensions/moonshot/openclaw.plugin.json @@ -52,6 +52,18 @@ "mediaUnderstandingProviders": ["moonshot"], "webSearchProviders": ["kimi"] }, + "mediaUnderstandingProviderMetadata": { + "moonshot": { + "capabilities": ["image", "video"], + "defaultModels": { + "image": "kimi-k2.6", + "video": "kimi-k2.6" + }, + "autoPriority": { + "video": 20 + } + } + }, "configContracts": { "compatibilityRuntimePaths": ["tools.web.search.apiKey"] }, diff --git a/extensions/openai/openclaw.plugin.json b/extensions/openai/openclaw.plugin.json index f6fd55ffc15..1ed00df3455 100644 --- a/extensions/openai/openclaw.plugin.json +++ b/extensions/openai/openclaw.plugin.json @@ -44,6 +44,25 @@ "imageGenerationProviders": ["openai"], "videoGenerationProviders": ["openai"] }, + "mediaUnderstandingProviderMetadata": { + "openai": { + "capabilities": ["image", "audio"], + "defaultModels": { + "image": "gpt-5.4-mini", + "audio": "gpt-4o-transcribe" + }, + "autoPriority": { + "image": 10, + "audio": 10 + } + }, + "openai-codex": { + "capabilities": ["image"], + "defaultModels": { + "image": "gpt-5.4" + } + } + }, "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/openrouter/openclaw.plugin.json b/extensions/openrouter/openclaw.plugin.json index 00a9633678d..8dcadde6331 100644 --- a/extensions/openrouter/openclaw.plugin.json +++ b/extensions/openrouter/openclaw.plugin.json @@ -23,6 +23,14 @@ "contracts": { "mediaUnderstandingProviders": ["openrouter"] }, + "mediaUnderstandingProviderMetadata": { + "openrouter": { + "capabilities": ["image"], + "defaultModels": { + "image": "auto" + } + } + }, "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/qwen/openclaw.plugin.json b/extensions/qwen/openclaw.plugin.json index e440f9f9204..8c3c3eaddc5 100644 --- a/extensions/qwen/openclaw.plugin.json +++ b/extensions/qwen/openclaw.plugin.json @@ -6,6 +6,18 @@ "mediaUnderstandingProviders": ["qwen"], "videoGenerationProviders": ["qwen"] }, + "mediaUnderstandingProviderMetadata": { + "qwen": { + "capabilities": ["image", "video"], + "defaultModels": { + "image": "qwen-vl-max-latest", + "video": "qwen-vl-max-latest" + }, + "autoPriority": { + "video": 15 + } + } + }, "providerAuthEnvVars": { "qwen": ["QWEN_API_KEY", "MODELSTUDIO_API_KEY", "DASHSCOPE_API_KEY"] }, diff --git a/extensions/zai/openclaw.plugin.json b/extensions/zai/openclaw.plugin.json index f7d5838f54d..5a8a097dd70 100644 --- a/extensions/zai/openclaw.plugin.json +++ b/extensions/zai/openclaw.plugin.json @@ -79,6 +79,17 @@ "contracts": { "mediaUnderstandingProviders": ["zai"] }, + "mediaUnderstandingProviderMetadata": { + "zai": { + "capabilities": ["image"], + "defaultModels": { + "image": "glm-4.6v" + }, + "autoPriority": { + "image": 60 + } + } + }, "configSchema": { "type": "object", "additionalProperties": false, diff --git a/src/agents/tools/pdf-tool.helpers.ts b/src/agents/tools/pdf-tool.helpers.ts index d7c0e8bbbb3..055902eebf4 100644 --- a/src/agents/tools/pdf-tool.helpers.ts +++ b/src/agents/tools/pdf-tool.helpers.ts @@ -4,7 +4,7 @@ import { resolveAgentModelPrimaryValue, } from "../../config/model-input.js"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; -import { bundledProviderSupportsNativePdfDocument } from "../../media-understanding/bundled-defaults.js"; +import { providerSupportsNativePdfDocument } from "../../media-understanding/defaults.js"; import { extractAssistantText } from "../pi-embedded-utils.js"; export type PdfModelConfig = { primary?: string; fallbacks?: string[] }; @@ -38,7 +38,7 @@ export function resolvePdfInputs(record: Record): string[] { * Check whether a provider supports native PDF document input. */ export function providerSupportsNativePdf(provider: string): boolean { - return bundledProviderSupportsNativePdfDocument(provider); + return providerSupportsNativePdfDocument({ providerId: provider }); } /** diff --git a/src/agents/tools/pdf-tool.model-config.ts b/src/agents/tools/pdf-tool.model-config.ts index 51b1ffad473..c6ac810de81 100644 --- a/src/agents/tools/pdf-tool.model-config.ts +++ b/src/agents/tools/pdf-tool.model-config.ts @@ -1,9 +1,9 @@ import type { OpenClawConfig } from "../../config/types.openclaw.js"; import { - bundledProviderSupportsNativePdfDocument, - resolveBundledAutoMediaKeyProviders, - resolveBundledDefaultMediaModel, -} from "../../media-understanding/bundled-defaults.js"; + providerSupportsNativePdfDocument, + resolveAutoMediaKeyProviders, + resolveDefaultMediaModel, +} from "../../media-understanding/defaults.js"; import { coerceImageModelConfig, type ImageModelConfig, @@ -12,12 +12,12 @@ import { import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js"; import { coercePdfModelConfig } from "./pdf-tool.helpers.js"; -function resolveBundledImageCandidateRefs(params: { +function resolveImageCandidateRefs(params: { cfg?: OpenClawConfig; agentDir: string; filter?: (providerId: string) => boolean; }): string[] { - return resolveBundledAutoMediaKeyProviders("image") + return resolveAutoMediaKeyProviders({ capability: "image", cfg: params.cfg }) .filter((providerId) => !params.filter || params.filter(providerId)) .filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir })) .map((providerId) => { @@ -26,7 +26,8 @@ function resolveBundledImageCandidateRefs(params: { cfg: params.cfg, provider: providerId, })?.split("/")[1] ?? - resolveBundledDefaultMediaModel({ + resolveDefaultMediaModel({ + cfg: params.cfg, providerId, capability: "image", }); @@ -69,17 +70,21 @@ export function resolvePdfModelConfigForTool(params: { }); const providerDefault = providerVision?.split("/")[1] ?? - resolveBundledDefaultMediaModel({ + resolveDefaultMediaModel({ + cfg: params.cfg, providerId: primary.provider, capability: "image", }); - const primarySupportsNativePdf = bundledProviderSupportsNativePdfDocument(primary.provider); - const nativePdfCandidates = resolveBundledImageCandidateRefs({ + const primarySupportsNativePdf = providerSupportsNativePdfDocument({ + cfg: params.cfg, + providerId: primary.provider, + }); + const nativePdfCandidates = resolveImageCandidateRefs({ cfg: params.cfg, agentDir: params.agentDir, - filter: bundledProviderSupportsNativePdfDocument, + filter: (providerId) => providerSupportsNativePdfDocument({ cfg: params.cfg, providerId }), }); - const genericImageCandidates = resolveBundledImageCandidateRefs({ + const genericImageCandidates = resolveImageCandidateRefs({ cfg: params.cfg, agentDir: params.agentDir, }); diff --git a/src/media-understanding/bundled-defaults.ts b/src/media-understanding/bundled-defaults.ts deleted file mode 100644 index 7feefc9ffff..00000000000 --- a/src/media-understanding/bundled-defaults.ts +++ /dev/null @@ -1,109 +0,0 @@ -import { normalizeMediaProviderId } from "./provider-id.js"; -import type { MediaUnderstandingCapability } from "./types.js"; - -type BundledMediaProviderDefaults = { - defaultModels?: Partial>; - autoPriority?: Partial>; - nativeDocumentInputs?: Array<"pdf">; -}; - -const BUNDLED_MEDIA_PROVIDER_DEFAULTS: Record = { - openai: { - defaultModels: { image: "gpt-5.4-mini", audio: "gpt-4o-transcribe" }, - autoPriority: { image: 10, audio: 10 }, - }, - "openai-codex": { - defaultModels: { image: "gpt-5.4" }, - }, - anthropic: { - defaultModels: { image: "claude-opus-4-7" }, - autoPriority: { image: 20 }, - nativeDocumentInputs: ["pdf"], - }, - google: { - defaultModels: { - image: "gemini-3-flash-preview", - audio: "gemini-3-flash-preview", - video: "gemini-3-flash-preview", - }, - autoPriority: { image: 30, audio: 40, video: 10 }, - nativeDocumentInputs: ["pdf"], - }, - groq: { - defaultModels: { audio: "whisper-large-v3-turbo" }, - autoPriority: { audio: 20 }, - }, - deepgram: { - defaultModels: { audio: "nova-3" }, - autoPriority: { audio: 30 }, - }, - mistral: { - defaultModels: { audio: "voxtral-mini-latest" }, - autoPriority: { audio: 50 }, - }, - minimax: { - defaultModels: { image: "MiniMax-VL-01" }, - autoPriority: { image: 40 }, - }, - "minimax-portal": { - defaultModels: { image: "MiniMax-VL-01" }, - autoPriority: { image: 50 }, - }, - zai: { - defaultModels: { image: "glm-4.6v" }, - autoPriority: { image: 60 }, - }, - qwen: { - defaultModels: { image: "qwen-vl-max-latest", video: "qwen-vl-max-latest" }, - autoPriority: { video: 15 }, - }, - moonshot: { - defaultModels: { image: "kimi-k2.6", video: "kimi-k2.6" }, - autoPriority: { video: 20 }, - }, - openrouter: { - defaultModels: { image: "auto" }, - }, -}; - -export function getBundledMediaProviderDefaults( - providerId: string, -): BundledMediaProviderDefaults | null { - return BUNDLED_MEDIA_PROVIDER_DEFAULTS[normalizeMediaProviderId(providerId)] ?? null; -} - -export function resolveBundledDefaultMediaModel(params: { - providerId: string; - capability: MediaUnderstandingCapability; -}): string | undefined { - return getBundledMediaProviderDefaults(params.providerId)?.defaultModels?.[ - params.capability - ]?.trim(); -} - -export function resolveBundledAutoMediaKeyProviders( - capability: MediaUnderstandingCapability, -): string[] { - return Object.entries(BUNDLED_MEDIA_PROVIDER_DEFAULTS) - .map(([providerId, defaults]) => ({ - providerId, - priority: defaults.autoPriority?.[capability], - })) - .filter( - (entry): entry is { providerId: string; priority: number } => - typeof entry.priority === "number", - ) - .toSorted((left, right) => { - if (left.priority !== right.priority) { - return left.priority - right.priority; - } - return left.providerId.localeCompare(right.providerId); - }) - .map((entry) => entry.providerId); -} - -export function bundledProviderSupportsNativePdfDocument(providerId: string): boolean { - return ( - getBundledMediaProviderDefaults(providerId)?.nativeDocumentInputs?.includes("pdf") ?? false - ); -} diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index 20be331daff..3c7c544cbb3 100644 --- a/src/media-understanding/defaults.ts +++ b/src/media-understanding/defaults.ts @@ -1,11 +1,7 @@ import type { OpenClawConfig } from "../config/types.js"; import { normalizeOptionalString } from "../shared/string-coerce.js"; -import { - bundledProviderSupportsNativePdfDocument, - resolveBundledAutoMediaKeyProviders, - resolveBundledDefaultMediaModel, -} from "./bundled-defaults.js"; -import { buildMediaUnderstandingRegistry, normalizeMediaProviderId } from "./provider-registry.js"; +import { buildMediaUnderstandingManifestMetadataRegistry } from "./manifest-metadata.js"; +import { normalizeMediaProviderId } from "./provider-registry.js"; import { providerSupportsCapability } from "./provider-supports.js"; import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js"; @@ -39,8 +35,30 @@ export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; export const CLI_OUTPUT_MAX_BUFFER = 5 * MB; export const DEFAULT_MEDIA_CONCURRENCY = 2; +let defaultRegistryCache: Map | null = null; +const configRegistryCache = new WeakMap>(); + function resolveDefaultRegistry(cfg?: OpenClawConfig) { - return buildMediaUnderstandingRegistry(undefined, cfg ?? ({} as OpenClawConfig)); + if (!cfg) { + defaultRegistryCache ??= buildMediaUnderstandingManifestMetadataRegistry(); + return defaultRegistryCache; + } + const cached = configRegistryCache.get(cfg); + if (cached) { + return cached; + } + const registry = buildMediaUnderstandingManifestMetadataRegistry(cfg); + configRegistryCache.set(cfg, registry); + return registry; +} + +function providerHasDeclaredCapability( + provider: MediaUnderstandingProvider | undefined, + capability: MediaUnderstandingCapability, +): boolean { + return ( + provider?.capabilities?.includes(capability) ?? providerSupportsCapability(provider, capability) + ); } function resolveConfiguredImageProviderModel(params: { @@ -68,6 +86,28 @@ function resolveConfiguredImageProviderModel(params: { return undefined; } +function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] { + const providers = cfg?.models?.providers; + if (!providers || typeof providers !== "object") { + return []; + } + const configured: string[] = []; + for (const [providerKey, providerCfg] of Object.entries(providers)) { + const normalizedProviderId = normalizeMediaProviderId(providerKey); + if (!normalizedProviderId || configured.includes(normalizedProviderId)) { + continue; + } + const models = providerCfg?.models ?? []; + const hasImageModel = models.some( + (model) => Array.isArray(model?.input) && model.input.includes("image"), + ); + if (hasImageModel) { + configured.push(normalizedProviderId); + } + } + return configured; +} + export function resolveDefaultMediaModel(params: { providerId: string; capability: MediaUnderstandingCapability; @@ -85,13 +125,6 @@ export function resolveDefaultMediaModel(params: { if (configuredImageModel) { return configuredImageModel; } - const bundledDefault = resolveBundledDefaultMediaModel({ - providerId: params.providerId, - capability: params.capability, - }); - if (bundledDefault) { - return bundledDefault; - } } const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg); const provider = registry.get(normalizeMediaProviderId(params.providerId)); @@ -103,35 +136,13 @@ export function resolveAutoMediaKeyProviders(params: { cfg?: OpenClawConfig; providerRegistry?: Map; }): string[] { - if (!params.providerRegistry) { - const bundledProviders = resolveBundledAutoMediaKeyProviders(params.capability); - if (params.capability !== "image") { - return bundledProviders; - } - const configProviders = params.cfg?.models?.providers; - if (!configProviders || typeof configProviders !== "object") { - return bundledProviders; - } - const merged = [...bundledProviders]; - for (const [providerKey, providerCfg] of Object.entries(configProviders)) { - const normalizedProviderId = normalizeMediaProviderId(providerKey); - const models = providerCfg?.models ?? []; - const hasImageModel = models.some( - (model) => Array.isArray(model?.input) && model.input.includes("image"), - ); - if (hasImageModel && !merged.includes(normalizedProviderId)) { - merged.push(normalizedProviderId); - } - } - return merged; - } const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg); type AutoProviderEntry = { provider: MediaUnderstandingProvider; priority: number; }; - return [...registry.values()] - .filter((provider) => providerSupportsCapability(provider, params.capability)) + const prioritized = [...registry.values()] + .filter((provider) => providerHasDeclaredCapability(provider, params.capability)) .map((provider): AutoProviderEntry | null => { const priority = provider.autoPriority?.[params.capability]; return typeof priority === "number" && Number.isFinite(priority) @@ -147,6 +158,10 @@ export function resolveAutoMediaKeyProviders(params: { }) .map((entry) => normalizeMediaProviderId(entry.provider.id)) .filter(Boolean); + if (params.providerRegistry || params.capability !== "image") { + return prioritized; + } + return [...new Set([...prioritized, ...resolveConfiguredImageProviderIds(params.cfg)])]; } export function providerSupportsNativePdfDocument(params: { @@ -154,9 +169,6 @@ export function providerSupportsNativePdfDocument(params: { cfg?: OpenClawConfig; providerRegistry?: Map; }): boolean { - if (!params.providerRegistry && bundledProviderSupportsNativePdfDocument(params.providerId)) { - return true; - } const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg); const provider = registry.get(normalizeMediaProviderId(params.providerId)); return provider?.nativeDocumentInputs?.includes("pdf") ?? false; diff --git a/src/media-understanding/manifest-metadata.ts b/src/media-understanding/manifest-metadata.ts new file mode 100644 index 00000000000..a993679f945 --- /dev/null +++ b/src/media-understanding/manifest-metadata.ts @@ -0,0 +1,36 @@ +import type { OpenClawConfig } from "../config/types.js"; +import { loadPluginManifestRegistry } from "../plugins/manifest-registry.js"; +import { normalizeMediaProviderId } from "./provider-id.js"; +import type { MediaUnderstandingProvider } from "./types.js"; + +export function buildMediaUnderstandingManifestMetadataRegistry( + cfg?: OpenClawConfig, +): Map { + const registry = new Map(); + for (const plugin of loadPluginManifestRegistry({ + config: cfg, + env: process.env, + }).plugins) { + const declaredProviders = new Set( + (plugin.contracts?.mediaUnderstandingProviders ?? []).map((providerId) => + normalizeMediaProviderId(providerId), + ), + ); + for (const [providerId, metadata] of Object.entries( + plugin.mediaUnderstandingProviderMetadata ?? {}, + )) { + const normalizedProviderId = normalizeMediaProviderId(providerId); + if (!normalizedProviderId || !declaredProviders.has(normalizedProviderId)) { + continue; + } + registry.set(normalizedProviderId, { + id: normalizedProviderId, + capabilities: metadata.capabilities, + defaultModels: metadata.defaultModels, + autoPriority: metadata.autoPriority, + nativeDocumentInputs: metadata.nativeDocumentInputs, + }); + } + } + return registry; +} diff --git a/src/plugins/manifest-registry.test.ts b/src/plugins/manifest-registry.test.ts index 0ae869a4479..c98ad028bae 100644 --- a/src/plugins/manifest-registry.test.ts +++ b/src/plugins/manifest-registry.test.ts @@ -532,6 +532,54 @@ describe("loadPluginManifestRegistry", () => { }); }); + it("preserves media-understanding provider metadata from plugin manifests", () => { + const dir = makeTempDir(); + writeManifest(dir, { + id: "openai", + contracts: { + mediaUnderstandingProviders: ["openai"], + }, + mediaUnderstandingProviderMetadata: { + openai: { + capabilities: ["image", "audio", "unknown"], + defaultModels: { + image: "gpt-5.4-mini", + audio: "gpt-4o-transcribe", + unknown: "ignored", + }, + autoPriority: { + image: 10, + audio: 20, + video: "ignored", + }, + nativeDocumentInputs: ["pdf", "docx"], + }, + }, + configSchema: { type: "object" }, + }); + + const registry = loadSingleCandidateRegistry({ + idHint: "openai", + rootDir: dir, + origin: "bundled", + }); + + expect(registry.plugins[0]?.mediaUnderstandingProviderMetadata).toEqual({ + openai: { + capabilities: ["image", "audio"], + defaultModels: { + image: "gpt-5.4-mini", + audio: "gpt-4o-transcribe", + }, + autoPriority: { + image: 10, + audio: 20, + }, + nativeDocumentInputs: ["pdf"], + }, + }); + }); + it("preserves channel env metadata from plugin manifests", () => { const dir = makeTempDir(); writeManifest(dir, { diff --git a/src/plugins/manifest-registry.ts b/src/plugins/manifest-registry.ts index 7e5e187804e..fb02c065f8a 100644 --- a/src/plugins/manifest-registry.ts +++ b/src/plugins/manifest-registry.ts @@ -33,6 +33,7 @@ import { type PluginManifest, type PluginManifestChannelConfig, type PluginManifestContracts, + type PluginManifestMediaUnderstandingProviderMetadata, type PluginManifestModelSupport, type PluginManifestProviderEndpoint, type PluginManifestQaRunner, @@ -112,6 +113,10 @@ export type PluginManifestRecord = { configSchema?: Record; configUiHints?: Record; contracts?: PluginManifestContracts; + mediaUnderstandingProviderMetadata?: Record< + string, + PluginManifestMediaUnderstandingProviderMetadata + >; configContracts?: PluginManifestConfigContracts; channelConfigs?: Record; channelCatalogMeta?: { @@ -359,6 +364,7 @@ function buildRecord(params: { configSchema: params.configSchema, configUiHints: params.manifest.uiHints, contracts: params.manifest.contracts, + mediaUnderstandingProviderMetadata: params.manifest.mediaUnderstandingProviderMetadata, configContracts: params.manifest.configContracts, channelConfigs, ...(params.candidate.packageManifest?.channel?.id diff --git a/src/plugins/manifest.ts b/src/plugins/manifest.ts index 2b319ce1092..91e131a1585 100644 --- a/src/plugins/manifest.ts +++ b/src/plugins/manifest.ts @@ -219,6 +219,11 @@ export type PluginManifest = { * compat wiring, and contract coverage without importing plugin runtime. */ contracts?: PluginManifestContracts; + /** Cheap media-understanding provider defaults without importing plugin runtime. */ + mediaUnderstandingProviderMetadata?: Record< + string, + PluginManifestMediaUnderstandingProviderMetadata + >; /** Manifest-owned config behavior consumed by generic core helpers. */ configContracts?: PluginManifestConfigContracts; channelConfigs?: Record; @@ -238,6 +243,15 @@ export type PluginManifestContracts = { tools?: string[]; }; +export type PluginManifestMediaUnderstandingCapability = "image" | "audio" | "video"; + +export type PluginManifestMediaUnderstandingProviderMetadata = { + capabilities?: PluginManifestMediaUnderstandingCapability[]; + defaultModels?: Partial>; + autoPriority?: Partial>; + nativeDocumentInputs?: Array<"pdf">; +}; + export type PluginManifestProviderAuthChoice = { /** Provider id owned by this manifest entry. */ provider: string; @@ -311,6 +325,92 @@ function normalizeStringRecord(value: unknown): Record | undefin return Object.keys(normalized).length > 0 ? normalized : undefined; } +const MEDIA_UNDERSTANDING_CAPABILITIES = new Set(["image", "audio", "video"]); + +function normalizeMediaUnderstandingCapabilityRecord( + value: unknown, +): Partial> | undefined { + if (!isRecord(value)) { + return undefined; + } + const normalized: Partial> = {}; + for (const [rawKey, rawValue] of Object.entries(value)) { + if (!MEDIA_UNDERSTANDING_CAPABILITIES.has(rawKey)) { + continue; + } + const model = normalizeOptionalString(rawValue); + if (model) { + normalized[rawKey as PluginManifestMediaUnderstandingCapability] = model; + } + } + return Object.keys(normalized).length > 0 ? normalized : undefined; +} + +function normalizeMediaUnderstandingPriorityRecord( + value: unknown, +): Partial> | undefined { + if (!isRecord(value)) { + return undefined; + } + const normalized: Partial> = {}; + for (const [rawKey, rawValue] of Object.entries(value)) { + if ( + !MEDIA_UNDERSTANDING_CAPABILITIES.has(rawKey) || + typeof rawValue !== "number" || + !Number.isFinite(rawValue) + ) { + continue; + } + normalized[rawKey as PluginManifestMediaUnderstandingCapability] = rawValue; + } + return Object.keys(normalized).length > 0 ? normalized : undefined; +} + +function normalizeMediaUnderstandingCapabilities( + value: unknown, +): PluginManifestMediaUnderstandingCapability[] | undefined { + const values = normalizeTrimmedStringList(value).filter((entry) => + MEDIA_UNDERSTANDING_CAPABILITIES.has(entry), + ) as PluginManifestMediaUnderstandingCapability[]; + return values.length > 0 ? values : undefined; +} + +function normalizeMediaUnderstandingNativeDocumentInputs(value: unknown): Array<"pdf"> | undefined { + const values = normalizeTrimmedStringList(value).filter((entry) => entry === "pdf"); + return values.length > 0 ? values : undefined; +} + +function normalizeMediaUnderstandingProviderMetadata( + value: unknown, +): Record | undefined { + if (!isRecord(value)) { + return undefined; + } + const normalized: Record = {}; + for (const [rawProviderId, rawMetadata] of Object.entries(value)) { + const providerId = normalizeOptionalString(rawProviderId) ?? ""; + if (!providerId || !isRecord(rawMetadata)) { + continue; + } + const capabilities = normalizeMediaUnderstandingCapabilities(rawMetadata.capabilities); + const defaultModels = normalizeMediaUnderstandingCapabilityRecord(rawMetadata.defaultModels); + const autoPriority = normalizeMediaUnderstandingPriorityRecord(rawMetadata.autoPriority); + const nativeDocumentInputs = normalizeMediaUnderstandingNativeDocumentInputs( + rawMetadata.nativeDocumentInputs, + ); + const metadata = { + ...(capabilities ? { capabilities } : {}), + ...(defaultModels ? { defaultModels } : {}), + ...(autoPriority ? { autoPriority } : {}), + ...(nativeDocumentInputs ? { nativeDocumentInputs } : {}), + } satisfies PluginManifestMediaUnderstandingProviderMetadata; + if (Object.keys(metadata).length > 0) { + normalized[providerId] = metadata; + } + } + return Object.keys(normalized).length > 0 ? normalized : undefined; +} + function normalizeManifestContracts(value: unknown): PluginManifestContracts | undefined { if (!isRecord(value)) { return undefined; @@ -769,6 +869,9 @@ export function loadPluginManifest( const qaRunners = normalizeManifestQaRunners(raw.qaRunners); const skills = normalizeTrimmedStringList(raw.skills); const contracts = normalizeManifestContracts(raw.contracts); + const mediaUnderstandingProviderMetadata = normalizeMediaUnderstandingProviderMetadata( + raw.mediaUnderstandingProviderMetadata, + ); const configContracts = normalizeManifestConfigContracts(raw.configContracts); const channelConfigs = normalizeChannelConfigs(raw.channelConfigs); @@ -810,6 +913,7 @@ export function loadPluginManifest( version, uiHints, contracts, + mediaUnderstandingProviderMetadata, configContracts, channelConfigs, }, diff --git a/src/secrets/channel-contract-surface-guardrails.test.ts b/src/secrets/channel-contract-surface-guardrails.test.ts index 38a3f04824d..876e8ab83d2 100644 --- a/src/secrets/channel-contract-surface-guardrails.test.ts +++ b/src/secrets/channel-contract-surface-guardrails.test.ts @@ -54,6 +54,13 @@ const CORE_SECRET_SURFACE_GUARDS = [ path: "src/gateway/channel-health-policy.ts", forbiddenPatterns: [/\btelegram\b/], }, + { + path: "src/media-understanding/defaults.ts", + forbiddenPatterns: [ + /\b(?:openai|anthropic|google|groq|deepgram|mistral|minimax|zai|qwen|moonshot|openrouter)\b/, + /\b(?:gpt-|claude-|gemini-|whisper-|nova-|voxtral-|MiniMax-|glm-|qwen-|kimi-)\b/, + ], + }, ] as const; describe("channel secret contract surface guardrails", () => {