refactor: move media defaults into plugin manifests

This commit is contained in:
Peter Steinberger
2026-04-22 05:28:48 +01:00
parent 2e775fb03e
commit 7189b49f81
21 changed files with 484 additions and 194 deletions

View File

@@ -148,36 +148,37 @@ Those belong in your plugin code and `package.json`.
## Top-level field reference
| Field | Required | Type | What it means |
| ----------------------------------- | -------- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `id` | Yes | `string` | Canonical plugin id. This is the id used in `plugins.entries.<id>`. |
| `configSchema` | Yes | `object` | Inline JSON Schema for this plugin's config. |
| `enabledByDefault` | No | `true` | Marks a bundled plugin as enabled by default. Omit it, or set any non-`true` value, to leave the plugin disabled by default. |
| `legacyPluginIds` | No | `string[]` | Legacy ids that normalize to this canonical plugin id. |
| `autoEnableWhenConfiguredProviders` | No | `string[]` | Provider ids that should auto-enable this plugin when auth, config, or model refs mention them. |
| `kind` | No | `"memory"` \| `"context-engine"` | Declares an exclusive plugin kind used by `plugins.slots.*`. |
| `channels` | No | `string[]` | Channel ids owned by this plugin. Used for discovery and config validation. |
| `providers` | No | `string[]` | Provider ids owned by this plugin. |
| `modelSupport` | No | `object` | Manifest-owned shorthand model-family metadata used to auto-load the plugin before runtime. |
| `providerEndpoints` | No | `object[]` | Manifest-owned endpoint host/baseUrl metadata for provider routes that core must classify before provider runtime loads. |
| `cliBackends` | No | `string[]` | CLI inference backend ids owned by this plugin. Used for startup auto-activation from explicit config refs. |
| `syntheticAuthRefs` | No | `string[]` | Provider or CLI backend refs whose plugin-owned synthetic auth hook should be probed during cold model discovery before runtime loads. |
| `nonSecretAuthMarkers` | No | `string[]` | Bundled-plugin-owned placeholder API key values that represent non-secret local, OAuth, or ambient credential state. |
| `commandAliases` | No | `object[]` | Command names owned by this plugin that should produce plugin-aware config and CLI diagnostics before runtime loads. |
| `providerAuthEnvVars` | No | `Record<string, string[]>` | Cheap provider-auth env metadata that OpenClaw can inspect without loading plugin code. |
| `providerAuthAliases` | No | `Record<string, string>` | Provider ids that should reuse another provider id for auth lookup, for example a coding provider that shares the base provider API key and auth profiles. |
| `channelEnvVars` | No | `Record<string, string[]>` | Cheap channel env metadata that OpenClaw can inspect without loading plugin code. Use this for env-driven channel setup or auth surfaces that generic startup/config helpers should see. |
| `providerAuthChoices` | No | `object[]` | Cheap auth-choice metadata for onboarding pickers, preferred-provider resolution, and simple CLI flag wiring. |
| `activation` | No | `object` | Cheap activation hints for provider, command, channel, route, and capability-triggered loading. Metadata only; plugin runtime still owns actual behavior. |
| `setup` | No | `object` | Cheap setup/onboarding descriptors that discovery and setup surfaces can inspect without loading plugin runtime. |
| `qaRunners` | No | `object[]` | Cheap QA runner descriptors used by the shared `openclaw qa` host before plugin runtime loads. |
| `contracts` | No | `object` | Static bundled capability snapshot for speech, realtime transcription, realtime voice, media-understanding, image-generation, music-generation, video-generation, web-fetch, web search, and tool ownership. |
| `channelConfigs` | No | `Record<string, object>` | Manifest-owned channel config metadata merged into discovery and validation surfaces before runtime loads. |
| `skills` | No | `string[]` | Skill directories to load, relative to the plugin root. |
| `name` | No | `string` | Human-readable plugin name. |
| `description` | No | `string` | Short summary shown in plugin surfaces. |
| `version` | No | `string` | Informational plugin version. |
| `uiHints` | No | `Record<string, object>` | UI labels, placeholders, and sensitivity hints for config fields. |
| Field | Required | Type | What it means |
| ------------------------------------ | -------- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `id` | Yes | `string` | Canonical plugin id. This is the id used in `plugins.entries.<id>`. |
| `configSchema` | Yes | `object` | Inline JSON Schema for this plugin's config. |
| `enabledByDefault` | No | `true` | Marks a bundled plugin as enabled by default. Omit it, or set any non-`true` value, to leave the plugin disabled by default. |
| `legacyPluginIds` | No | `string[]` | Legacy ids that normalize to this canonical plugin id. |
| `autoEnableWhenConfiguredProviders` | No | `string[]` | Provider ids that should auto-enable this plugin when auth, config, or model refs mention them. |
| `kind` | No | `"memory"` \| `"context-engine"` | Declares an exclusive plugin kind used by `plugins.slots.*`. |
| `channels` | No | `string[]` | Channel ids owned by this plugin. Used for discovery and config validation. |
| `providers` | No | `string[]` | Provider ids owned by this plugin. |
| `modelSupport` | No | `object` | Manifest-owned shorthand model-family metadata used to auto-load the plugin before runtime. |
| `providerEndpoints` | No | `object[]` | Manifest-owned endpoint host/baseUrl metadata for provider routes that core must classify before provider runtime loads. |
| `cliBackends` | No | `string[]` | CLI inference backend ids owned by this plugin. Used for startup auto-activation from explicit config refs. |
| `syntheticAuthRefs` | No | `string[]` | Provider or CLI backend refs whose plugin-owned synthetic auth hook should be probed during cold model discovery before runtime loads. |
| `nonSecretAuthMarkers` | No | `string[]` | Bundled-plugin-owned placeholder API key values that represent non-secret local, OAuth, or ambient credential state. |
| `commandAliases` | No | `object[]` | Command names owned by this plugin that should produce plugin-aware config and CLI diagnostics before runtime loads. |
| `providerAuthEnvVars` | No | `Record<string, string[]>` | Cheap provider-auth env metadata that OpenClaw can inspect without loading plugin code. |
| `providerAuthAliases` | No | `Record<string, string>` | Provider ids that should reuse another provider id for auth lookup, for example a coding provider that shares the base provider API key and auth profiles. |
| `channelEnvVars` | No | `Record<string, string[]>` | Cheap channel env metadata that OpenClaw can inspect without loading plugin code. Use this for env-driven channel setup or auth surfaces that generic startup/config helpers should see. |
| `providerAuthChoices` | No | `object[]` | Cheap auth-choice metadata for onboarding pickers, preferred-provider resolution, and simple CLI flag wiring. |
| `activation` | No | `object` | Cheap activation hints for provider, command, channel, route, and capability-triggered loading. Metadata only; plugin runtime still owns actual behavior. |
| `setup` | No | `object` | Cheap setup/onboarding descriptors that discovery and setup surfaces can inspect without loading plugin runtime. |
| `qaRunners` | No | `object[]` | Cheap QA runner descriptors used by the shared `openclaw qa` host before plugin runtime loads. |
| `contracts` | No | `object` | Static bundled capability snapshot for speech, realtime transcription, realtime voice, media-understanding, image-generation, music-generation, video-generation, web-fetch, web-search, and tool ownership. |
| `mediaUnderstandingProviderMetadata` | No | `Record<string, object>` | Cheap media-understanding defaults for provider ids declared in `contracts.mediaUnderstandingProviders`. |
| `channelConfigs` | No | `Record<string, object>` | Manifest-owned channel config metadata merged into discovery and validation surfaces before runtime loads. |
| `skills` | No | `string[]` | Skill directories to load, relative to the plugin root. |
| `name` | No | `string` | Human-readable plugin name. |
| `description` | No | `string` | Short summary shown in plugin surfaces. |
| `version` | No | `string` | Informational plugin version. |
| `uiHints` | No | `Record<string, object>` | UI labels, placeholders, and sensitivity hints for config fields. |
## providerAuthChoices reference
@@ -408,6 +409,43 @@ Each list is optional:
| `webSearchProviders` | `string[]` | Web-search provider ids this plugin owns. |
| `tools` | `string[]` | Agent tool names this plugin owns for bundled contract checks. |
## mediaUnderstandingProviderMetadata reference
Use `mediaUnderstandingProviderMetadata` when a media-understanding provider has
default models, auto-auth fallback priority, or native document support that
generic core helpers need before runtime loads. Keys must also be declared in
`contracts.mediaUnderstandingProviders`.
```json
{
"contracts": {
"mediaUnderstandingProviders": ["example"]
},
"mediaUnderstandingProviderMetadata": {
"example": {
"capabilities": ["image", "audio"],
"defaultModels": {
"image": "example-vision-latest",
"audio": "example-transcribe-latest"
},
"autoPriority": {
"image": 40
},
"nativeDocumentInputs": ["pdf"]
}
}
}
```
Each provider entry can include:
| Field | Type | What it means |
| ---------------------- | ----------------------------------- | ---------------------------------------------------------------------------- |
| `capabilities` | `("image" \| "audio" \| "video")[]` | Media capabilities exposed by this provider. |
| `defaultModels` | `Record<string, string>` | Capability-to-model defaults used when config does not specify a model. |
| `autoPriority` | `Record<string, number>` | Lower numbers sort earlier for automatic credential-based provider fallback. |
| `nativeDocumentInputs` | `"pdf"[]` | Native document inputs supported by the provider. |
## channelConfigs reference
Use `channelConfigs` when a channel plugin needs cheap config metadata before

View File

@@ -40,6 +40,18 @@
"contracts": {
"mediaUnderstandingProviders": ["anthropic"]
},
"mediaUnderstandingProviderMetadata": {
"anthropic": {
"capabilities": ["image"],
"defaultModels": {
"image": "claude-opus-4-7"
},
"autoPriority": {
"image": 20
},
"nativeDocumentInputs": ["pdf"]
}
},
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -7,6 +7,17 @@
"contracts": {
"mediaUnderstandingProviders": ["deepgram"]
},
"mediaUnderstandingProviderMetadata": {
"deepgram": {
"capabilities": ["audio"],
"defaultModels": {
"audio": "nova-3"
},
"autoPriority": {
"audio": 30
}
}
},
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -53,6 +53,22 @@
"videoGenerationProviders": ["google"],
"webSearchProviders": ["gemini"]
},
"mediaUnderstandingProviderMetadata": {
"google": {
"capabilities": ["image", "audio", "video"],
"defaultModels": {
"image": "gemini-3-flash-preview",
"audio": "gemini-3-flash-preview",
"video": "gemini-3-flash-preview"
},
"autoPriority": {
"image": 30,
"audio": 40,
"video": 10
},
"nativeDocumentInputs": ["pdf"]
}
},
"configContracts": {
"compatibilityRuntimePaths": ["tools.web.search.apiKey"]
},

View File

@@ -7,6 +7,17 @@
"contracts": {
"mediaUnderstandingProviders": ["groq"]
},
"mediaUnderstandingProviderMetadata": {
"groq": {
"capabilities": ["audio"],
"defaultModels": {
"audio": "whisper-large-v3-turbo"
},
"autoPriority": {
"audio": 20
}
}
},
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -72,6 +72,26 @@
"configContracts": {
"compatibilityRuntimePaths": ["tools.web.search.apiKey"]
},
"mediaUnderstandingProviderMetadata": {
"minimax": {
"capabilities": ["image"],
"defaultModels": {
"image": "MiniMax-VL-01"
},
"autoPriority": {
"image": 40
}
},
"minimax-portal": {
"capabilities": ["image"],
"defaultModels": {
"image": "MiniMax-VL-01"
},
"autoPriority": {
"image": 50
}
}
},
"uiHints": {
"webSearch.apiKey": {
"label": "MiniMax Coding Plan key",

View File

@@ -24,6 +24,17 @@
"memoryEmbeddingProviders": ["mistral"],
"mediaUnderstandingProviders": ["mistral"]
},
"mediaUnderstandingProviderMetadata": {
"mistral": {
"capabilities": ["audio"],
"defaultModels": {
"audio": "voxtral-mini-latest"
},
"autoPriority": {
"audio": 50
}
}
},
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -52,6 +52,18 @@
"mediaUnderstandingProviders": ["moonshot"],
"webSearchProviders": ["kimi"]
},
"mediaUnderstandingProviderMetadata": {
"moonshot": {
"capabilities": ["image", "video"],
"defaultModels": {
"image": "kimi-k2.6",
"video": "kimi-k2.6"
},
"autoPriority": {
"video": 20
}
}
},
"configContracts": {
"compatibilityRuntimePaths": ["tools.web.search.apiKey"]
},

View File

@@ -44,6 +44,25 @@
"imageGenerationProviders": ["openai"],
"videoGenerationProviders": ["openai"]
},
"mediaUnderstandingProviderMetadata": {
"openai": {
"capabilities": ["image", "audio"],
"defaultModels": {
"image": "gpt-5.4-mini",
"audio": "gpt-4o-transcribe"
},
"autoPriority": {
"image": 10,
"audio": 10
}
},
"openai-codex": {
"capabilities": ["image"],
"defaultModels": {
"image": "gpt-5.4"
}
}
},
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -23,6 +23,14 @@
"contracts": {
"mediaUnderstandingProviders": ["openrouter"]
},
"mediaUnderstandingProviderMetadata": {
"openrouter": {
"capabilities": ["image"],
"defaultModels": {
"image": "auto"
}
}
},
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -6,6 +6,18 @@
"mediaUnderstandingProviders": ["qwen"],
"videoGenerationProviders": ["qwen"]
},
"mediaUnderstandingProviderMetadata": {
"qwen": {
"capabilities": ["image", "video"],
"defaultModels": {
"image": "qwen-vl-max-latest",
"video": "qwen-vl-max-latest"
},
"autoPriority": {
"video": 15
}
}
},
"providerAuthEnvVars": {
"qwen": ["QWEN_API_KEY", "MODELSTUDIO_API_KEY", "DASHSCOPE_API_KEY"]
},

View File

@@ -79,6 +79,17 @@
"contracts": {
"mediaUnderstandingProviders": ["zai"]
},
"mediaUnderstandingProviderMetadata": {
"zai": {
"capabilities": ["image"],
"defaultModels": {
"image": "glm-4.6v"
},
"autoPriority": {
"image": 60
}
}
},
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -4,7 +4,7 @@ import {
resolveAgentModelPrimaryValue,
} from "../../config/model-input.js";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { bundledProviderSupportsNativePdfDocument } from "../../media-understanding/bundled-defaults.js";
import { providerSupportsNativePdfDocument } from "../../media-understanding/defaults.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
export type PdfModelConfig = { primary?: string; fallbacks?: string[] };
@@ -38,7 +38,7 @@ export function resolvePdfInputs(record: Record<string, unknown>): string[] {
* Check whether a provider supports native PDF document input.
*/
export function providerSupportsNativePdf(provider: string): boolean {
return bundledProviderSupportsNativePdfDocument(provider);
return providerSupportsNativePdfDocument({ providerId: provider });
}
/**

View File

@@ -1,9 +1,9 @@
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import {
bundledProviderSupportsNativePdfDocument,
resolveBundledAutoMediaKeyProviders,
resolveBundledDefaultMediaModel,
} from "../../media-understanding/bundled-defaults.js";
providerSupportsNativePdfDocument,
resolveAutoMediaKeyProviders,
resolveDefaultMediaModel,
} from "../../media-understanding/defaults.js";
import {
coerceImageModelConfig,
type ImageModelConfig,
@@ -12,12 +12,12 @@ import {
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
import { coercePdfModelConfig } from "./pdf-tool.helpers.js";
function resolveBundledImageCandidateRefs(params: {
function resolveImageCandidateRefs(params: {
cfg?: OpenClawConfig;
agentDir: string;
filter?: (providerId: string) => boolean;
}): string[] {
return resolveBundledAutoMediaKeyProviders("image")
return resolveAutoMediaKeyProviders({ capability: "image", cfg: params.cfg })
.filter((providerId) => !params.filter || params.filter(providerId))
.filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
.map((providerId) => {
@@ -26,7 +26,8 @@ function resolveBundledImageCandidateRefs(params: {
cfg: params.cfg,
provider: providerId,
})?.split("/")[1] ??
resolveBundledDefaultMediaModel({
resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
@@ -69,17 +70,21 @@ export function resolvePdfModelConfigForTool(params: {
});
const providerDefault =
providerVision?.split("/")[1] ??
resolveBundledDefaultMediaModel({
resolveDefaultMediaModel({
cfg: params.cfg,
providerId: primary.provider,
capability: "image",
});
const primarySupportsNativePdf = bundledProviderSupportsNativePdfDocument(primary.provider);
const nativePdfCandidates = resolveBundledImageCandidateRefs({
const primarySupportsNativePdf = providerSupportsNativePdfDocument({
cfg: params.cfg,
providerId: primary.provider,
});
const nativePdfCandidates = resolveImageCandidateRefs({
cfg: params.cfg,
agentDir: params.agentDir,
filter: bundledProviderSupportsNativePdfDocument,
filter: (providerId) => providerSupportsNativePdfDocument({ cfg: params.cfg, providerId }),
});
const genericImageCandidates = resolveBundledImageCandidateRefs({
const genericImageCandidates = resolveImageCandidateRefs({
cfg: params.cfg,
agentDir: params.agentDir,
});

View File

@@ -1,109 +0,0 @@
import { normalizeMediaProviderId } from "./provider-id.js";
import type { MediaUnderstandingCapability } from "./types.js";
/** Static media-understanding defaults recorded for a single provider id. */
type BundledMediaProviderDefaults = {
/** Default model id per media capability. */
defaultModels?: Partial<Record<MediaUnderstandingCapability, string>>;
/** Per-capability sort key for automatic provider ordering; lower values are ordered first. */
autoPriority?: Partial<Record<MediaUnderstandingCapability, number>>;
/** Document input kinds listed for the provider; only "pdf" is representable. */
nativeDocumentInputs?: Array<"pdf">;
};
/**
 * Hard-coded media-understanding defaults, keyed by provider id.
 *
 * Lookups go through `normalizeMediaProviderId`, so callers do not index this
 * table with raw ids. A provider that has no `autoPriority` entry for a
 * capability is left out of `resolveBundledAutoMediaKeyProviders` for that
 * capability, even when it lists a default model for it.
 */
const BUNDLED_MEDIA_PROVIDER_DEFAULTS: Record<string, BundledMediaProviderDefaults> = {
openai: {
defaultModels: { image: "gpt-5.4-mini", audio: "gpt-4o-transcribe" },
autoPriority: { image: 10, audio: 10 },
},
"openai-codex": {
defaultModels: { image: "gpt-5.4" },
},
anthropic: {
defaultModels: { image: "claude-opus-4-7" },
autoPriority: { image: 20 },
nativeDocumentInputs: ["pdf"],
},
google: {
defaultModels: {
image: "gemini-3-flash-preview",
audio: "gemini-3-flash-preview",
video: "gemini-3-flash-preview",
},
autoPriority: { image: 30, audio: 40, video: 10 },
nativeDocumentInputs: ["pdf"],
},
groq: {
defaultModels: { audio: "whisper-large-v3-turbo" },
autoPriority: { audio: 20 },
},
deepgram: {
defaultModels: { audio: "nova-3" },
autoPriority: { audio: 30 },
},
mistral: {
defaultModels: { audio: "voxtral-mini-latest" },
autoPriority: { audio: 50 },
},
minimax: {
defaultModels: { image: "MiniMax-VL-01" },
autoPriority: { image: 40 },
},
"minimax-portal": {
defaultModels: { image: "MiniMax-VL-01" },
autoPriority: { image: 50 },
},
zai: {
defaultModels: { image: "glm-4.6v" },
autoPriority: { image: 60 },
},
qwen: {
defaultModels: { image: "qwen-vl-max-latest", video: "qwen-vl-max-latest" },
autoPriority: { video: 15 },
},
moonshot: {
defaultModels: { image: "kimi-k2.6", video: "kimi-k2.6" },
autoPriority: { video: 20 },
},
openrouter: {
defaultModels: { image: "auto" },
},
};
/**
 * Looks up the bundled media defaults for a provider.
 *
 * The id is normalized with `normalizeMediaProviderId` before the table lookup.
 *
 * @param providerId - Provider id as supplied by the caller.
 * @returns The matching defaults entry, or `null` when the table has none.
 */
export function getBundledMediaProviderDefaults(
  providerId: string,
): BundledMediaProviderDefaults | null {
  const lookupKey = normalizeMediaProviderId(providerId);
  const entry = BUNDLED_MEDIA_PROVIDER_DEFAULTS[lookupKey];
  return entry ?? null;
}
/**
 * Resolves the bundled default model for one provider and capability.
 *
 * @param params.providerId - Provider id to look up.
 * @param params.capability - Media capability whose default model is wanted.
 * @returns The trimmed model id, or `undefined` when the provider or the
 *   capability has no bundled default.
 */
export function resolveBundledDefaultMediaModel(params: {
  providerId: string;
  capability: MediaUnderstandingCapability;
}): string | undefined {
  const { providerId, capability } = params;
  const defaults = getBundledMediaProviderDefaults(providerId);
  const model = defaults?.defaultModels?.[capability];
  return model?.trim();
}
/**
 * Lists bundled provider ids eligible for automatic selection for a capability.
 *
 * Only providers with a numeric `autoPriority` for the capability are included.
 * The result is ordered by ascending priority; equal priorities are ordered by
 * `localeCompare` on the provider id.
 *
 * @param capability - Media capability to rank providers for.
 * @returns Provider ids in selection order.
 */
export function resolveBundledAutoMediaKeyProviders(
  capability: MediaUnderstandingCapability,
): string[] {
  const ranked: Array<{ providerId: string; priority: number }> = [];
  for (const [providerId, defaults] of Object.entries(BUNDLED_MEDIA_PROVIDER_DEFAULTS)) {
    const priority = defaults.autoPriority?.[capability];
    if (typeof priority === "number") {
      ranked.push({ providerId, priority });
    }
  }
  // `ranked` is a fresh local array, so sorting it in place is not observable by callers.
  ranked.sort((left, right) =>
    left.priority === right.priority
      ? left.providerId.localeCompare(right.providerId)
      : left.priority - right.priority,
  );
  return ranked.map(({ providerId }) => providerId);
}
/**
 * Reports whether the bundled defaults list "pdf" as a native document input
 * for the given provider.
 *
 * @param providerId - Provider id to look up.
 * @returns `true` only when the provider has an entry whose
 *   `nativeDocumentInputs` includes "pdf".
 */
export function bundledProviderSupportsNativePdfDocument(providerId: string): boolean {
  const documentInputs = getBundledMediaProviderDefaults(providerId)?.nativeDocumentInputs;
  if (documentInputs == null) {
    return false;
  }
  return documentInputs.includes("pdf");
}

View File

@@ -1,11 +1,7 @@
import type { OpenClawConfig } from "../config/types.js";
import { normalizeOptionalString } from "../shared/string-coerce.js";
import {
bundledProviderSupportsNativePdfDocument,
resolveBundledAutoMediaKeyProviders,
resolveBundledDefaultMediaModel,
} from "./bundled-defaults.js";
import { buildMediaUnderstandingRegistry, normalizeMediaProviderId } from "./provider-registry.js";
import { buildMediaUnderstandingManifestMetadataRegistry } from "./manifest-metadata.js";
import { normalizeMediaProviderId } from "./provider-registry.js";
import { providerSupportsCapability } from "./provider-supports.js";
import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
@@ -39,8 +35,30 @@ export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
export const DEFAULT_MEDIA_CONCURRENCY = 2;
let defaultRegistryCache: Map<string, MediaUnderstandingProvider> | null = null;
const configRegistryCache = new WeakMap<OpenClawConfig, Map<string, MediaUnderstandingProvider>>();
function resolveDefaultRegistry(cfg?: OpenClawConfig) {
return buildMediaUnderstandingRegistry(undefined, cfg ?? ({} as OpenClawConfig));
if (!cfg) {
defaultRegistryCache ??= buildMediaUnderstandingManifestMetadataRegistry();
return defaultRegistryCache;
}
const cached = configRegistryCache.get(cfg);
if (cached) {
return cached;
}
const registry = buildMediaUnderstandingManifestMetadataRegistry(cfg);
configRegistryCache.set(cfg, registry);
return registry;
}
/**
 * Checks whether a provider covers a capability.
 *
 * An explicit `capabilities` list on the provider is authoritative. Only when
 * the provider (or its list) is absent does the check defer to
 * `providerSupportsCapability`.
 */
function providerHasDeclaredCapability(
  provider: MediaUnderstandingProvider | undefined,
  capability: MediaUnderstandingCapability,
): boolean {
  const declared = provider?.capabilities;
  if (declared == null) {
    return providerSupportsCapability(provider, capability);
  }
  return declared.includes(capability);
}
function resolveConfiguredImageProviderModel(params: {
@@ -68,6 +86,28 @@ function resolveConfiguredImageProviderModel(params: {
return undefined;
}
/**
 * Collects provider ids from `cfg.models.providers` that configure at least
 * one model whose `input` array includes "image".
 *
 * Ids are normalized with `normalizeMediaProviderId`. Keys that normalize to an
 * empty id, or to an id that was already collected, are skipped. The result
 * keeps the order in which ids were first collected.
 *
 * @param cfg - Optional config; a missing or non-object `models.providers`
 *   yields an empty list.
 */
function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] {
  const providers = cfg?.models?.providers;
  if (!providers || typeof providers !== "object") {
    return [];
  }
  // A Set iterates in insertion order, which matches the first-seen ordering.
  const collected = new Set<string>();
  for (const [providerKey, providerCfg] of Object.entries(providers)) {
    const providerId = normalizeMediaProviderId(providerKey);
    if (!providerId || collected.has(providerId)) {
      continue;
    }
    const acceptsImage = (providerCfg?.models ?? []).some(
      (model) => Array.isArray(model?.input) && model.input.includes("image"),
    );
    if (acceptsImage) {
      collected.add(providerId);
    }
  }
  return [...collected];
}
export function resolveDefaultMediaModel(params: {
providerId: string;
capability: MediaUnderstandingCapability;
@@ -85,13 +125,6 @@ export function resolveDefaultMediaModel(params: {
if (configuredImageModel) {
return configuredImageModel;
}
const bundledDefault = resolveBundledDefaultMediaModel({
providerId: params.providerId,
capability: params.capability,
});
if (bundledDefault) {
return bundledDefault;
}
}
const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
const provider = registry.get(normalizeMediaProviderId(params.providerId));
@@ -103,35 +136,13 @@ export function resolveAutoMediaKeyProviders(params: {
cfg?: OpenClawConfig;
providerRegistry?: Map<string, MediaUnderstandingProvider>;
}): string[] {
if (!params.providerRegistry) {
const bundledProviders = resolveBundledAutoMediaKeyProviders(params.capability);
if (params.capability !== "image") {
return bundledProviders;
}
const configProviders = params.cfg?.models?.providers;
if (!configProviders || typeof configProviders !== "object") {
return bundledProviders;
}
const merged = [...bundledProviders];
for (const [providerKey, providerCfg] of Object.entries(configProviders)) {
const normalizedProviderId = normalizeMediaProviderId(providerKey);
const models = providerCfg?.models ?? [];
const hasImageModel = models.some(
(model) => Array.isArray(model?.input) && model.input.includes("image"),
);
if (hasImageModel && !merged.includes(normalizedProviderId)) {
merged.push(normalizedProviderId);
}
}
return merged;
}
const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
type AutoProviderEntry = {
provider: MediaUnderstandingProvider;
priority: number;
};
return [...registry.values()]
.filter((provider) => providerSupportsCapability(provider, params.capability))
const prioritized = [...registry.values()]
.filter((provider) => providerHasDeclaredCapability(provider, params.capability))
.map((provider): AutoProviderEntry | null => {
const priority = provider.autoPriority?.[params.capability];
return typeof priority === "number" && Number.isFinite(priority)
@@ -147,6 +158,10 @@ export function resolveAutoMediaKeyProviders(params: {
})
.map((entry) => normalizeMediaProviderId(entry.provider.id))
.filter(Boolean);
if (params.providerRegistry || params.capability !== "image") {
return prioritized;
}
return [...new Set([...prioritized, ...resolveConfiguredImageProviderIds(params.cfg)])];
}
export function providerSupportsNativePdfDocument(params: {
@@ -154,9 +169,6 @@ export function providerSupportsNativePdfDocument(params: {
cfg?: OpenClawConfig;
providerRegistry?: Map<string, MediaUnderstandingProvider>;
}): boolean {
if (!params.providerRegistry && bundledProviderSupportsNativePdfDocument(params.providerId)) {
return true;
}
const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
const provider = registry.get(normalizeMediaProviderId(params.providerId));
return provider?.nativeDocumentInputs?.includes("pdf") ?? false;

View File

@@ -0,0 +1,36 @@
import type { OpenClawConfig } from "../config/types.js";
import { loadPluginManifestRegistry } from "../plugins/manifest-registry.js";
import { normalizeMediaProviderId } from "./provider-id.js";
import type { MediaUnderstandingProvider } from "./types.js";
/**
 * Builds a provider registry from media-understanding metadata in plugin manifests.
 *
 * For each plugin returned by `loadPluginManifestRegistry`, a metadata entry is
 * kept only when its normalized provider id is non-empty and is also listed in
 * that plugin's `contracts.mediaUnderstandingProviders`. If several entries
 * normalize to the same id, the one processed last replaces the earlier ones.
 *
 * @param cfg - Optional config forwarded to the manifest registry loader.
 * @returns Map from normalized provider id to its manifest-derived provider record.
 */
export function buildMediaUnderstandingManifestMetadataRegistry(
  cfg?: OpenClawConfig,
): Map<string, MediaUnderstandingProvider> {
  const { plugins } = loadPluginManifestRegistry({ config: cfg, env: process.env });
  const providersById = new Map<string, MediaUnderstandingProvider>();
  for (const plugin of plugins) {
    const contractIds = plugin.contracts?.mediaUnderstandingProviders ?? [];
    const declaredIds = new Set(contractIds.map((rawId) => normalizeMediaProviderId(rawId)));
    const metadataByProvider = plugin.mediaUnderstandingProviderMetadata ?? {};
    for (const [rawId, metadata] of Object.entries(metadataByProvider)) {
      const id = normalizeMediaProviderId(rawId);
      if (id && declaredIds.has(id)) {
        providersById.set(id, {
          id,
          capabilities: metadata.capabilities,
          defaultModels: metadata.defaultModels,
          autoPriority: metadata.autoPriority,
          nativeDocumentInputs: metadata.nativeDocumentInputs,
        });
      }
    }
  }
  return providersById;
}

View File

@@ -532,6 +532,54 @@ describe("loadPluginManifestRegistry", () => {
});
});
// The manifest below mixes valid values with invalid ones (an "unknown"
// capability, a string priority for "video", and a "docx" document input).
// The expectation keeps only the valid values, so this pins both preservation
// and filtering of the manifest metadata.
it("preserves media-understanding provider metadata from plugin manifests", () => {
const dir = makeTempDir();
writeManifest(dir, {
id: "openai",
contracts: {
mediaUnderstandingProviders: ["openai"],
},
mediaUnderstandingProviderMetadata: {
openai: {
capabilities: ["image", "audio", "unknown"],
defaultModels: {
image: "gpt-5.4-mini",
audio: "gpt-4o-transcribe",
unknown: "ignored",
},
autoPriority: {
image: 10,
audio: 20,
video: "ignored",
},
nativeDocumentInputs: ["pdf", "docx"],
},
},
configSchema: { type: "object" },
});
const registry = loadSingleCandidateRegistry({
idHint: "openai",
rootDir: dir,
origin: "bundled",
});
expect(registry.plugins[0]?.mediaUnderstandingProviderMetadata).toEqual({
openai: {
capabilities: ["image", "audio"],
defaultModels: {
image: "gpt-5.4-mini",
audio: "gpt-4o-transcribe",
},
autoPriority: {
image: 10,
audio: 20,
},
nativeDocumentInputs: ["pdf"],
},
});
});
it("preserves channel env metadata from plugin manifests", () => {
const dir = makeTempDir();
writeManifest(dir, {

View File

@@ -33,6 +33,7 @@ import {
type PluginManifest,
type PluginManifestChannelConfig,
type PluginManifestContracts,
type PluginManifestMediaUnderstandingProviderMetadata,
type PluginManifestModelSupport,
type PluginManifestProviderEndpoint,
type PluginManifestQaRunner,
@@ -112,6 +113,10 @@ export type PluginManifestRecord = {
configSchema?: Record<string, unknown>;
configUiHints?: Record<string, PluginConfigUiHint>;
contracts?: PluginManifestContracts;
mediaUnderstandingProviderMetadata?: Record<
string,
PluginManifestMediaUnderstandingProviderMetadata
>;
configContracts?: PluginManifestConfigContracts;
channelConfigs?: Record<string, PluginManifestChannelConfig>;
channelCatalogMeta?: {
@@ -359,6 +364,7 @@ function buildRecord(params: {
configSchema: params.configSchema,
configUiHints: params.manifest.uiHints,
contracts: params.manifest.contracts,
mediaUnderstandingProviderMetadata: params.manifest.mediaUnderstandingProviderMetadata,
configContracts: params.manifest.configContracts,
channelConfigs,
...(params.candidate.packageManifest?.channel?.id

View File

@@ -219,6 +219,11 @@ export type PluginManifest = {
* compat wiring, and contract coverage without importing plugin runtime.
*/
contracts?: PluginManifestContracts;
/** Cheap media-understanding provider defaults without importing plugin runtime. */
mediaUnderstandingProviderMetadata?: Record<
string,
PluginManifestMediaUnderstandingProviderMetadata
>;
/** Manifest-owned config behavior consumed by generic core helpers. */
configContracts?: PluginManifestConfigContracts;
channelConfigs?: Record<string, PluginManifestChannelConfig>;
@@ -238,6 +243,15 @@ export type PluginManifestContracts = {
tools?: string[];
};
/** Media capability names accepted in manifest media-understanding metadata. */
export type PluginManifestMediaUnderstandingCapability = "image" | "audio" | "video";
/** Media-understanding defaults a plugin manifest declares for one provider id. */
export type PluginManifestMediaUnderstandingProviderMetadata = {
/** Media capabilities declared for the provider. */
capabilities?: PluginManifestMediaUnderstandingCapability[];
/** Default model id per capability. */
defaultModels?: Partial<Record<PluginManifestMediaUnderstandingCapability, string>>;
/** Per-capability priority number used to order providers for automatic selection. */
autoPriority?: Partial<Record<PluginManifestMediaUnderstandingCapability, number>>;
/** Native document input kinds; only "pdf" is representable. */
nativeDocumentInputs?: Array<"pdf">;
};
export type PluginManifestProviderAuthChoice = {
/** Provider id owned by this manifest entry. */
provider: string;
@@ -311,6 +325,92 @@ function normalizeStringRecord(value: unknown): Record<string, string> | undefin
return Object.keys(normalized).length > 0 ? normalized : undefined;
}
/** Capability names accepted when normalizing manifest media-understanding metadata. */
const MEDIA_UNDERSTANDING_CAPABILITIES = new Set(["image", "audio", "video"]);
/**
 * Normalizes a raw capability-to-model record from a manifest.
 *
 * Keys outside `MEDIA_UNDERSTANDING_CAPABILITIES` are dropped, as are values
 * that `normalizeOptionalString` does not turn into a non-empty string.
 *
 * @param value - Untrusted manifest value.
 * @returns The filtered record, or `undefined` when the input is not a record
 *   or nothing survives filtering.
 */
function normalizeMediaUnderstandingCapabilityRecord(
  value: unknown,
): Partial<Record<PluginManifestMediaUnderstandingCapability, string>> | undefined {
  if (!isRecord(value)) {
    return undefined;
  }
  const models: Partial<Record<PluginManifestMediaUnderstandingCapability, string>> = {};
  for (const [capability, rawModel] of Object.entries(value)) {
    if (MEDIA_UNDERSTANDING_CAPABILITIES.has(capability)) {
      const model = normalizeOptionalString(rawModel);
      if (model) {
        models[capability as PluginManifestMediaUnderstandingCapability] = model;
      }
    }
  }
  const hasEntries = Object.keys(models).length > 0;
  return hasEntries ? models : undefined;
}
/**
 * Normalizes a raw capability-to-priority record from a manifest.
 *
 * An entry is kept only when its key is in `MEDIA_UNDERSTANDING_CAPABILITIES`
 * and its value is a finite number.
 *
 * @param value - Untrusted manifest value.
 * @returns The filtered record, or `undefined` when the input is not a record
 *   or nothing survives filtering.
 */
function normalizeMediaUnderstandingPriorityRecord(
  value: unknown,
): Partial<Record<PluginManifestMediaUnderstandingCapability, number>> | undefined {
  if (!isRecord(value)) {
    return undefined;
  }
  const priorities: Partial<Record<PluginManifestMediaUnderstandingCapability, number>> = {};
  for (const [capability, priority] of Object.entries(value)) {
    const isKnownCapability = MEDIA_UNDERSTANDING_CAPABILITIES.has(capability);
    if (isKnownCapability && typeof priority === "number" && Number.isFinite(priority)) {
      priorities[capability as PluginManifestMediaUnderstandingCapability] = priority;
    }
  }
  const hasEntries = Object.keys(priorities).length > 0;
  return hasEntries ? priorities : undefined;
}
/**
 * Normalizes a raw capability list from a manifest.
 *
 * The value is first passed through `normalizeTrimmedStringList`; entries that
 * are not in `MEDIA_UNDERSTANDING_CAPABILITIES` are then dropped. Order is kept
 * and duplicates are not removed.
 *
 * @param value - Untrusted manifest value.
 * @returns The accepted capabilities, or `undefined` when none remain.
 */
function normalizeMediaUnderstandingCapabilities(
  value: unknown,
): PluginManifestMediaUnderstandingCapability[] | undefined {
  const accepted: PluginManifestMediaUnderstandingCapability[] = [];
  for (const entry of normalizeTrimmedStringList(value)) {
    if (MEDIA_UNDERSTANDING_CAPABILITIES.has(entry)) {
      accepted.push(entry as PluginManifestMediaUnderstandingCapability);
    }
  }
  return accepted.length === 0 ? undefined : accepted;
}
/**
 * Normalizes a raw native-document-input list from a manifest.
 *
 * The value is first passed through `normalizeTrimmedStringList`; only entries
 * equal to "pdf" are kept.
 *
 * @param value - Untrusted manifest value.
 * @returns The kept "pdf" entries, or `undefined` when there are none.
 */
function normalizeMediaUnderstandingNativeDocumentInputs(value: unknown): Array<"pdf"> | undefined {
  const pdfInputs: Array<"pdf"> = [];
  for (const entry of normalizeTrimmedStringList(value)) {
    if (entry === "pdf") {
      pdfInputs.push("pdf");
    }
  }
  return pdfInputs.length === 0 ? undefined : pdfInputs;
}
/**
 * Normalizes the manifest's `mediaUnderstandingProviderMetadata` record.
 *
 * Each provider entry is reduced to the fields that survive their individual
 * normalizers. Entries are skipped when the provider key normalizes to nothing,
 * when the entry is not a record, or when no field survives.
 *
 * @param value - Untrusted manifest value.
 * @returns Metadata keyed by normalized provider key, or `undefined` when the
 *   input is not a record or every entry was skipped.
 */
function normalizeMediaUnderstandingProviderMetadata(
  value: unknown,
): Record<string, PluginManifestMediaUnderstandingProviderMetadata> | undefined {
  if (!isRecord(value)) {
    return undefined;
  }
  const byProvider: Record<string, PluginManifestMediaUnderstandingProviderMetadata> = {};
  for (const [rawProviderId, rawMetadata] of Object.entries(value)) {
    const providerId = normalizeOptionalString(rawProviderId);
    if (!providerId || !isRecord(rawMetadata)) {
      continue;
    }
    const capabilities = normalizeMediaUnderstandingCapabilities(rawMetadata.capabilities);
    const defaultModels = normalizeMediaUnderstandingCapabilityRecord(rawMetadata.defaultModels);
    const autoPriority = normalizeMediaUnderstandingPriorityRecord(rawMetadata.autoPriority);
    const nativeDocumentInputs = normalizeMediaUnderstandingNativeDocumentInputs(
      rawMetadata.nativeDocumentInputs,
    );
    // Only attach fields that normalized to something, so empty entries can be detected below.
    const metadata: PluginManifestMediaUnderstandingProviderMetadata = {};
    if (capabilities) {
      metadata.capabilities = capabilities;
    }
    if (defaultModels) {
      metadata.defaultModels = defaultModels;
    }
    if (autoPriority) {
      metadata.autoPriority = autoPriority;
    }
    if (nativeDocumentInputs) {
      metadata.nativeDocumentInputs = nativeDocumentInputs;
    }
    if (Object.keys(metadata).length > 0) {
      byProvider[providerId] = metadata;
    }
  }
  const hasProviders = Object.keys(byProvider).length > 0;
  return hasProviders ? byProvider : undefined;
}
function normalizeManifestContracts(value: unknown): PluginManifestContracts | undefined {
if (!isRecord(value)) {
return undefined;
@@ -769,6 +869,9 @@ export function loadPluginManifest(
const qaRunners = normalizeManifestQaRunners(raw.qaRunners);
const skills = normalizeTrimmedStringList(raw.skills);
const contracts = normalizeManifestContracts(raw.contracts);
const mediaUnderstandingProviderMetadata = normalizeMediaUnderstandingProviderMetadata(
raw.mediaUnderstandingProviderMetadata,
);
const configContracts = normalizeManifestConfigContracts(raw.configContracts);
const channelConfigs = normalizeChannelConfigs(raw.channelConfigs);
@@ -810,6 +913,7 @@ export function loadPluginManifest(
version,
uiHints,
contracts,
mediaUnderstandingProviderMetadata,
configContracts,
channelConfigs,
},

View File

@@ -54,6 +54,13 @@ const CORE_SECRET_SURFACE_GUARDS = [
path: "src/gateway/channel-health-policy.ts",
forbiddenPatterns: [/\btelegram\b/],
},
{
path: "src/media-understanding/defaults.ts",
forbiddenPatterns: [
/\b(?:openai|anthropic|google|groq|deepgram|mistral|minimax|zai|qwen|moonshot|openrouter)\b/,
/\b(?:gpt-|claude-|gemini-|whisper-|nova-|voxtral-|MiniMax-|glm-|qwen-|kimi-)\b/,
],
},
] as const;
describe("channel secret contract surface guardrails", () => {