fix(config): accept video and audio model inputs

Preserve configured audio/video model input modalities through provider catalog normalization.

Fixes #20721.
Thanks @alvinttang.
This commit is contained in:
Alvin Tang
2026-04-26 12:18:54 +08:00
committed by GitHub
parent f1eef47839
commit 4428661779
10 changed files with 70 additions and 8 deletions

View File

@@ -74,6 +74,8 @@ Docs: https://docs.openclaw.ai
- Channels/status: keep read-only channel lists on manifest and package metadata by default, loading setup runtime only for explicit fallback callers. Thanks @shakkernerd. - Channels/status: keep read-only channel lists on manifest and package metadata by default, loading setup runtime only for explicit fallback callers. Thanks @shakkernerd.
- Plugins/onboarding: defer onboarding install-record index writes until the guarded config commit so setup failures cannot leave the plugin index ahead of `openclaw.json`. Thanks @shakkernerd. - Plugins/onboarding: defer onboarding install-record index writes until the guarded config commit so setup failures cannot leave the plugin index ahead of `openclaw.json`. Thanks @shakkernerd.
- Plugins/registry: resolve web provider ownership from the installed plugin index instead of broad manifest scans on secret, tool, and pricing paths. Thanks @shakkernerd. - Plugins/registry: resolve web provider ownership from the installed plugin index instead of broad manifest scans on secret, tool, and pricing paths. Thanks @shakkernerd.
- Config/providers: accept `video` and `audio` in configured model `input` values and
preserve them in provider catalog entries. Fixes #20721. Thanks @alvinttang.
- TTS: strip model-emitted TTS directives from streamed block text before channel - TTS: strip model-emitted TTS directives from streamed block text before channel
delivery, including directives split across adjacent blocks, while preserving delivery, including directives split across adjacent blocks, while preserving
the accumulated raw reply for final-mode synthesis. Fixes #38937. the accumulated raw reply for final-mode synthesis. Fixes #38937.

View File

@@ -269,7 +269,7 @@ export type LmstudioModelBase = {
trainedForToolUse: boolean; trainedForToolUse: boolean;
loaded: boolean; loaded: boolean;
reasoning: boolean; reasoning: boolean;
input: ModelDefinitionConfig["input"]; input: Array<"text" | "image">;
cost: ModelDefinitionConfig["cost"]; cost: ModelDefinitionConfig["cost"];
contextWindow: number; contextWindow: number;
contextTokens: number; contextTokens: number;

View File

@@ -822,6 +822,9 @@ export async function prepareLmstudioDynamicModels(
provider: PROVIDER_ID, provider: PROVIDER_ID,
api: ctx.providerConfig?.api ?? `openai-completions`, api: ctx.providerConfig?.api ?? `openai-completions`,
baseUrl, baseUrl,
input: model.input.filter(
(entry): entry is "text" | "image" => entry === "text" || entry === "image",
),
}), }),
); );
} }

View File

@@ -1,4 +1,4 @@
export type ModelInputType = "text" | "image" | "document"; export type ModelInputType = "text" | "image" | "audio" | "video" | "document";
export type ModelCatalogEntry = { export type ModelCatalogEntry = {
id: string; id: string;

View File

@@ -2908,6 +2908,14 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
type: "string", type: "string",
const: "image", const: "image",
}, },
{
type: "string",
const: "video",
},
{
type: "string",
const: "audio",
},
], ],
}, },
}, },

View File

@@ -80,7 +80,7 @@ export type ModelDefinitionConfig = {
api?: ModelApi; api?: ModelApi;
baseUrl?: string; baseUrl?: string;
reasoning: boolean; reasoning: boolean;
input: Array<"text" | "image">; input: Array<"text" | "image" | "video" | "audio">;
cost: { cost: {
input: number; input: number;
output: number; output: number;

View File

@@ -312,7 +312,11 @@ export const ModelDefinitionSchema = z
api: ModelApiSchema.optional(), api: ModelApiSchema.optional(),
baseUrl: z.string().min(1).optional(), baseUrl: z.string().min(1).optional(),
reasoning: z.boolean().optional(), reasoning: z.boolean().optional(),
input: z.array(z.union([z.literal("text"), z.literal("image")])).optional(), input: z
.array(
z.union([z.literal("text"), z.literal("image"), z.literal("video"), z.literal("audio")]),
)
.optional(),
cost: z cost: z
.object({ .object({
input: z.number().optional(), input: z.number().optional(),

View File

@@ -41,7 +41,7 @@ export type LmstudioModelBase = {
trainedForToolUse: boolean; trainedForToolUse: boolean;
loaded: boolean; loaded: boolean;
reasoning: boolean; reasoning: boolean;
input: ModelDefinitionConfig["input"]; input: Array<"text" | "image">;
cost: ModelDefinitionConfig["cost"]; cost: ModelDefinitionConfig["cost"];
contextWindow: number; contextWindow: number;
contextTokens: number; contextTokens: number;

View File

@@ -1,6 +1,7 @@
import { describe, expect, it } from "vitest"; import { describe, expect, it } from "vitest";
import { import {
applyProviderNativeStreamingUsageCompat, applyProviderNativeStreamingUsageCompat,
readConfiguredProviderCatalogEntries,
supportsNativeStreamingUsageCompat, supportsNativeStreamingUsageCompat,
} from "./provider-catalog-shared.js"; } from "./provider-catalog-shared.js";
import type { ModelDefinitionConfig } from "./provider-model-shared.js"; import type { ModelDefinitionConfig } from "./provider-model-shared.js";
@@ -54,3 +55,43 @@ describe("provider-catalog-shared native streaming usage compat", () => {
expect(provider.models?.[1]?.compat?.supportsUsageInStreaming).toBe(false); expect(provider.models?.[1]?.compat?.supportsUsageInStreaming).toBe(false);
}); });
}); });
describe("provider-catalog-shared configured catalog entries", () => {
  it("preserves configured audio and video input modalities", () => {
    // The expected catalog entry lists only provider, id, name, input,
    // reasoning, and contextWindow; `input` must still carry all four
    // configured modalities, including "video" and "audio".
    const expectedEntries = [
      {
        provider: "kilocode",
        id: "google/gemini-3-pro-preview",
        name: "Gemini 3 Pro Preview",
        input: ["text", "image", "video", "audio"],
        reasoning: true,
        contextWindow: 1048576,
      },
    ];

    // The argument is kept as an inline literal so the string literals in
    // `input` and `api` are typed contextually by the callee's signature.
    const actualEntries = readConfiguredProviderCatalogEntries({
      providerId: "kilocode",
      config: {
        models: {
          providers: {
            kilocode: {
              baseUrl: "https://api.kilo.ai/api/gateway/",
              api: "openai-completions",
              models: [
                {
                  id: "google/gemini-3-pro-preview",
                  name: "Gemini 3 Pro Preview",
                  input: ["text", "image", "video", "audio"],
                  reasoning: true,
                  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
                  contextWindow: 1048576,
                  maxTokens: 65536,
                },
              ],
            },
          },
        },
      },
    });

    expect(actualEntries).toEqual(expectedEntries);
  });
});

View File

@@ -23,7 +23,7 @@ export type ConfiguredProviderCatalogEntry = {
provider: string; provider: string;
contextWindow?: number; contextWindow?: number;
reasoning?: boolean; reasoning?: boolean;
input?: Array<"text" | "image" | "document">; input?: Array<"text" | "image" | "audio" | "video" | "document">;
}; };
function normalizeConfiguredCatalogModelInput( function normalizeConfiguredCatalogModelInput(
@@ -33,8 +33,12 @@ function normalizeConfiguredCatalogModelInput(
return undefined; return undefined;
} }
const normalized = input.filter( const normalized = input.filter(
(item): item is "text" | "image" | "document" => (item): item is "text" | "image" | "audio" | "video" | "document" =>
item === "text" || item === "image" || item === "document", item === "text" ||
item === "image" ||
item === "audio" ||
item === "video" ||
item === "document",
); );
return normalized.length > 0 ? normalized : undefined; return normalized.length > 0 ? normalized : undefined;
} }