From 44286617798ba9a256c62bbbdb5d11057053ea49 Mon Sep 17 00:00:00 2001 From: Alvin Tang Date: Sun, 26 Apr 2026 12:18:54 +0800 Subject: [PATCH] fix(config): accept video and audio model inputs Preserve configured audio/video model input modalities through provider catalog normalization.

Fixes #20721.
Thanks @alvinttang. --- CHANGELOG.md | 2 + extensions/lmstudio/src/models.ts | 2 +- extensions/lmstudio/src/setup.ts | 3 ++ src/agents/model-catalog.types.ts | 2 +- src/config/schema.base.generated.ts | 8 ++++ src/config/types.models.ts | 2 +- src/config/zod-schema.core.ts | 6 ++- src/plugin-sdk/lmstudio-runtime.ts | 2 +- .../provider-catalog-shared.test.ts | 41 +++++++++++++++++++ src/plugin-sdk/provider-catalog-shared.ts | 10 +++-- 10 files changed, 70 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 443246d93dd..ca226071daa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -74,6 +74,8 @@ Docs: https://docs.openclaw.ai - Channels/status: keep read-only channel lists on manifest and package metadata by default, loading setup runtime only for explicit fallback callers. Thanks @shakkernerd. - Plugins/onboarding: defer onboarding install-record index writes until the guarded config commit so setup failures cannot leave the plugin index ahead of `openclaw.json`. Thanks @shakkernerd. - Plugins/registry: resolve web provider ownership from the installed plugin index instead of broad manifest scans on secret, tool, and pricing paths. Thanks @shakkernerd. +- Config/providers: accept `video` and `audio` in configured model `input` values and + preserve them in provider catalog entries. Fixes #20721. Thanks @alvinttang. - TTS: strip model-emitted TTS directives from streamed block text before channel delivery, including directives split across adjacent blocks, while preserving the accumulated raw reply for final-mode synthesis. Fixes #38937. 
diff --git a/extensions/lmstudio/src/models.ts b/extensions/lmstudio/src/models.ts index ef43b58ca86..3c45b12b679 100644 --- a/extensions/lmstudio/src/models.ts +++ b/extensions/lmstudio/src/models.ts @@ -269,7 +269,7 @@ export type LmstudioModelBase = { trainedForToolUse: boolean; loaded: boolean; reasoning: boolean; - input: ModelDefinitionConfig["input"]; + input: Array<"text" | "image">; cost: ModelDefinitionConfig["cost"]; contextWindow: number; contextTokens: number; diff --git a/extensions/lmstudio/src/setup.ts b/extensions/lmstudio/src/setup.ts index 31526f1eda7..7e56310d185 100644 --- a/extensions/lmstudio/src/setup.ts +++ b/extensions/lmstudio/src/setup.ts @@ -822,6 +822,9 @@ export async function prepareLmstudioDynamicModels( provider: PROVIDER_ID, api: ctx.providerConfig?.api ?? `openai-completions`, baseUrl, + input: model.input.filter( + (entry): entry is "text" | "image" => entry === "text" || entry === "image", + ), }), ); } diff --git a/src/agents/model-catalog.types.ts b/src/agents/model-catalog.types.ts index 6be158ea521..0c54d405695 100644 --- a/src/agents/model-catalog.types.ts +++ b/src/agents/model-catalog.types.ts @@ -1,4 +1,4 @@ -export type ModelInputType = "text" | "image" | "document"; +export type ModelInputType = "text" | "image" | "audio" | "video" | "document"; export type ModelCatalogEntry = { id: string; diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts index 870637b1562..f18efeca40f 100644 --- a/src/config/schema.base.generated.ts +++ b/src/config/schema.base.generated.ts @@ -2908,6 +2908,14 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { type: "string", const: "image", }, + { + type: "string", + const: "video", + }, + { + type: "string", + const: "audio", + }, ], }, }, diff --git a/src/config/types.models.ts b/src/config/types.models.ts index 5ad3ac1ead2..985d3f476b3 100644 --- a/src/config/types.models.ts +++ b/src/config/types.models.ts @@ -80,7 +80,7 @@ export 
type ModelDefinitionConfig = { api?: ModelApi; baseUrl?: string; reasoning: boolean; - input: Array<"text" | "image">; + input: Array<"text" | "image" | "video" | "audio">; cost: { input: number; output: number; diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 0927b54a4b3..1b387910c88 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -312,7 +312,11 @@ export const ModelDefinitionSchema = z api: ModelApiSchema.optional(), baseUrl: z.string().min(1).optional(), reasoning: z.boolean().optional(), - input: z.array(z.union([z.literal("text"), z.literal("image")])).optional(), + input: z + .array( + z.union([z.literal("text"), z.literal("image"), z.literal("video"), z.literal("audio")]), + ) + .optional(), cost: z .object({ input: z.number().optional(), diff --git a/src/plugin-sdk/lmstudio-runtime.ts b/src/plugin-sdk/lmstudio-runtime.ts index 0898940e490..e8e2434cd0f 100644 --- a/src/plugin-sdk/lmstudio-runtime.ts +++ b/src/plugin-sdk/lmstudio-runtime.ts @@ -41,7 +41,7 @@ export type LmstudioModelBase = { trainedForToolUse: boolean; loaded: boolean; reasoning: boolean; - input: ModelDefinitionConfig["input"]; + input: Array<"text" | "image">; cost: ModelDefinitionConfig["cost"]; contextWindow: number; contextTokens: number; diff --git a/src/plugin-sdk/provider-catalog-shared.test.ts b/src/plugin-sdk/provider-catalog-shared.test.ts index 0f86c7c13f8..c36053b21ea 100644 --- a/src/plugin-sdk/provider-catalog-shared.test.ts +++ b/src/plugin-sdk/provider-catalog-shared.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from "vitest"; import { applyProviderNativeStreamingUsageCompat, + readConfiguredProviderCatalogEntries, supportsNativeStreamingUsageCompat, } from "./provider-catalog-shared.js"; import type { ModelDefinitionConfig } from "./provider-model-shared.js"; @@ -54,3 +55,43 @@ describe("provider-catalog-shared native streaming usage compat", () => { 
expect(provider.models?.[1]?.compat?.supportsUsageInStreaming).toBe(false); }); }); + +describe("provider-catalog-shared configured catalog entries", () => { + it("preserves configured audio and video input modalities", () => { + expect( + readConfiguredProviderCatalogEntries({ + providerId: "kilocode", + config: { + models: { + providers: { + kilocode: { + baseUrl: "https://api.kilo.ai/api/gateway/", + api: "openai-completions", + models: [ + { + id: "google/gemini-3-pro-preview", + name: "Gemini 3 Pro Preview", + input: ["text", "image", "video", "audio"], + reasoning: true, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 1048576, + maxTokens: 65536, + }, + ], + }, + }, + }, + }, + }), + ).toEqual([ + { + provider: "kilocode", + id: "google/gemini-3-pro-preview", + name: "Gemini 3 Pro Preview", + input: ["text", "image", "video", "audio"], + reasoning: true, + contextWindow: 1048576, + }, + ]); + }); +}); diff --git a/src/plugin-sdk/provider-catalog-shared.ts b/src/plugin-sdk/provider-catalog-shared.ts index b9aea6de842..968f3a7e9af 100644 --- a/src/plugin-sdk/provider-catalog-shared.ts +++ b/src/plugin-sdk/provider-catalog-shared.ts @@ -23,7 +23,7 @@ export type ConfiguredProviderCatalogEntry = { provider: string; contextWindow?: number; reasoning?: boolean; - input?: Array<"text" | "image" | "document">; + input?: Array<"text" | "image" | "audio" | "video" | "document">; }; function normalizeConfiguredCatalogModelInput( @@ -33,8 +33,12 @@ function normalizeConfiguredCatalogModelInput( return undefined; } const normalized = input.filter( - (item): item is "text" | "image" | "document" => - item === "text" || item === "image" || item === "document", + (item): item is "text" | "image" | "audio" | "video" | "document" => + item === "text" || + item === "image" || + item === "audio" || + item === "video" || + item === "document", ); return normalized.length > 0 ? normalized : undefined; }