From 50069bcb59dcfa219ccf31511391caf6bad375ae Mon Sep 17 00:00:00 2001 From: Shakker Date: Wed, 1 Apr 2026 20:11:48 +0100 Subject: [PATCH] fix: guard media image auto model resolution --- src/media-understanding/defaults.test.ts | 6 + src/media-understanding/defaults.ts | 3 + src/media-understanding/runner.ts | 95 ++++++++++++++-- .../runner.vision-skip.test.ts | 103 +++++++++++++++++- 4 files changed, 197 insertions(+), 10 deletions(-) diff --git a/src/media-understanding/defaults.test.ts b/src/media-understanding/defaults.test.ts index 1670d4bdf6a..f7ccedfb85e 100644 --- a/src/media-understanding/defaults.test.ts +++ b/src/media-understanding/defaults.test.ts @@ -35,4 +35,10 @@ describe("DEFAULT_IMAGE_MODELS", () => { it("includes the MiniMax portal vision default", () => { expect(DEFAULT_IMAGE_MODELS["minimax-portal"]).toBe("MiniMax-VL-01"); }); + + it("includes bundled image-provider defaults beyond the core provider set", () => { + expect(DEFAULT_IMAGE_MODELS["openai-codex"]).toBe("gpt-5.4"); + expect(DEFAULT_IMAGE_MODELS.moonshot).toBe("kimi-k2.5"); + expect(DEFAULT_IMAGE_MODELS.openrouter).toBe("auto"); + }); }); diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index a7c0d76d021..b92dba7ed1c 100644 --- a/src/media-understanding/defaults.ts +++ b/src/media-understanding/defaults.ts @@ -52,10 +52,13 @@ export const AUTO_IMAGE_KEY_PROVIDERS = [ export const AUTO_VIDEO_KEY_PROVIDERS = ["google", "moonshot"] as const; export const DEFAULT_IMAGE_MODELS: Record = { openai: "gpt-5-mini", + "openai-codex": "gpt-5.4", anthropic: "claude-opus-4-6", google: "gemini-3-flash-preview", minimax: "MiniMax-VL-01", "minimax-portal": "MiniMax-VL-01", + moonshot: "kimi-k2.5", + openrouter: "auto", zai: "glm-4.6v", }; export const CLI_OUTPUT_MAX_BUFFER = 5 * MB; diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index a5caf4cbcd0..03050fed066 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -8,6 +8,7 @@ import { loadModelCatalog, modelSupportsVision, } from "../agents/model-catalog.js"; +import { findNormalizedProviderValue } from "../agents/provider-id.js"; import type { MsgContext } from "../auto-reply/templating.js"; import type { OpenClawConfig } from "../config/config.js"; import { @@ -102,6 +103,68 @@ function resolveConfiguredKeyProviderOrder(params: { return [...new Set([...configuredProviders, ...params.fallbackProviders])]; } +function resolveConfiguredImageModelId(params: { + cfg: OpenClawConfig; + providerId: string; +}): string | undefined { + const providerCfg = findNormalizedProviderValue( + params.cfg.models?.providers, + params.providerId, + ) as + | { + models?: Array<{ + id?: string; + input?: string[]; + }>; + } + | undefined; + const configured = providerCfg?.models?.find((entry) => { + const id = entry?.id?.trim(); + return Boolean(id) && entry?.input?.includes("image"); + }); + const id = configured?.id?.trim(); + return id || undefined; +} + +function resolveCatalogImageModelId(params: { + providerId: string; + catalog: Awaited>; +}): string | undefined { + const matches = params.catalog.filter( + (entry) => + normalizeMediaProviderId(entry.provider) === params.providerId && modelSupportsVision(entry), + ); + if (matches.length === 0) { + return undefined; + } + const autoEntry = matches.find((entry) => entry.id.trim().toLowerCase() === "auto"); + return (autoEntry ?? matches[0])?.id.trim() || undefined; +} + +async function resolveAutoImageModelId(params: { + cfg: OpenClawConfig; + providerId: string; + explicitModel?: string; +}): Promise { + const explicit = params.explicitModel?.trim(); + if (explicit) { + return explicit; + } + const configuredModel = resolveConfiguredImageModelId(params); + if (configuredModel) { + return configuredModel; + } + const defaultModel = DEFAULT_IMAGE_MODELS[params.providerId]; + if (defaultModel) { + return defaultModel; + } + const catalog = await loadModelCatalog({ config: params.cfg }); + return resolveCatalogImageModelId({ + providerId: params.providerId, + catalog, + }); +} + export function buildProviderRegistry( overrides?: Record, cfg?: OpenClawConfig, @@ -390,7 +453,14 @@ async function resolveKeyEntry(params: { ) { return null; } - return { type: "provider" as const, provider: providerId, model }; + const resolvedModel = + capability === "image" + ? await resolveAutoImageModelId({ cfg, providerId, explicitModel: model }) + : model; + if (capability === "image" && !resolvedModel) { + return null; + } + return { type: "provider" as const, provider: providerId, model: resolvedModel }; }; if (capability === "image") { @@ -407,8 +477,7 @@ async function resolveKeyEntry(params: { capability, fallbackProviders: AUTO_IMAGE_KEY_PROVIDERS, })) { - const model = DEFAULT_IMAGE_MODELS[providerId]; - const entry = await checkProvider(providerId, model); + const entry = await checkProvider(providerId); if (entry) { return entry; } @@ -533,11 +602,8 @@ export async function resolveAutoImageModel(params: { return null; } const provider = entry.provider; - if (!provider) { - return null; - } - const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider]; - if (!model) { + const model = entry.model?.trim(); + if (!provider || !model) { return null; } return { provider, model }; @@ -599,10 +665,21 @@ async function resolveActiveModelEntry(params: { if (!hasAuth) { return null; } + const model = + params.capability === "image" + ? await resolveAutoImageModelId({ + cfg: params.cfg, + providerId, + explicitModel: params.activeModel?.model, + }) + : params.activeModel?.model; + if (params.capability === "image" && !model) { + return null; + } return { type: "provider", provider: providerId, - model: params.activeModel?.model, + model, }; } diff --git a/src/media-understanding/runner.vision-skip.test.ts b/src/media-understanding/runner.vision-skip.test.ts index ba227510737..398832fda0d 100644 --- a/src/media-understanding/runner.vision-skip.test.ts +++ b/src/media-understanding/runner.vision-skip.test.ts @@ -11,8 +11,9 @@ import { loadPluginManifestRegistry } from "../plugins/manifest-registry.js"; import { createEmptyPluginRegistry } from "../plugins/registry.js"; import { setActivePluginRegistry } from "../plugins/runtime.js"; import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js"; +import { withMediaFixture } from "./runner.test-utils.js"; -const catalog = [ +const baseCatalog = [ { id: "gpt-4.1", name: "GPT-4.1", @@ -20,6 +21,7 @@ const catalog = [ input: ["text", "image"] as const, }, ]; +let catalog = [...baseCatalog]; const loadModelCatalog = vi.hoisted(() => vi.fn(async () => catalog)); @@ -85,6 +87,7 @@ describe("runCapability image skip", () => { }); beforeEach(() => { + catalog = [...baseCatalog]; loadModelCatalog.mockClear(); setActivePluginRegistry(createEmptyPluginRegistry()); vi.unstubAllEnvs(); @@ -150,4 +153,102 @@ describe("runCapability image skip", () => { vi.unstubAllEnvs(); } }); + + it("auto-selects configured OpenRouter image providers with a resolved model", async () => { + let seenModel: string | undefined; + await withMediaFixture( + { + filePrefix: "openclaw-image-openrouter", + extension: "png", + mediaType: "image/png", + fileContents: Buffer.from("image"), + }, + async ({ ctx, media, cache }) => { + const cfg = { + models: { + providers: { + openrouter: { + apiKey: "test-openrouter-key", // pragma: allowlist secret + models: [], + }, + }, + }, + } as unknown as OpenClawConfig; + + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, + agentDir: "/tmp", + providerRegistry: new Map([ + [ + "openrouter", + { + id: "openrouter", + capabilities: ["image"], + describeImage: async (req) => { + seenModel = req.model; + return { text: "openrouter ok", model: req.model }; + }, + }, + ], + ]), + }); + + expect(result.decision.outcome).toBe("success"); + expect(result.outputs[0]?.provider).toBe("openrouter"); + expect(result.outputs[0]?.model).toBe("auto"); + expect(result.outputs[0]?.text).toBe("openrouter ok"); + expect(seenModel).toBe("auto"); + }, + ); + }); + + it("skips configured image providers without an auto-resolvable model", async () => { + await withMediaFixture( + { + filePrefix: "openclaw-image-custom-skip", + extension: "png", + mediaType: "image/png", + fileContents: Buffer.from("image"), + }, + async ({ ctx, media, cache }) => { + const cfg = { + models: { + providers: { + "custom-image": { + apiKey: "test-custom-key", // pragma: allowlist secret + models: [], + }, + }, + }, + } as unknown as OpenClawConfig; + + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, + agentDir: "/tmp", + providerRegistry: new Map([ + [ + "custom-image", + { + id: "custom-image", + capabilities: ["image"], + describeImage: async () => ({ text: "custom ok" }), + }, + ], + ]), + }); + + expect(result.outputs).toHaveLength(0); + expect(result.decision.outcome).toBe("skipped"); + expect(result.decision.attachments).toEqual([{ attachmentIndex: 0, attempts: [] }]); + }, + ); + }); });