From d268c850e61a55422ea335f5fbf8154a1fcf24ce Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Fri, 24 Apr 2026 02:21:25 +0100
Subject: [PATCH] fix: honor explicit media image model routing

---
 CHANGELOG.md                          |   1 +
 src/media-understanding/runner.ts     |  73 +++++++++--
 .../runner.vision-skip.test.ts        | 122 +++++++++++++++++-
 3 files changed, 183 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e72bbe53426..a1b746703d1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Media understanding: honor explicit image-model configuration before native-vision skips, including `agents.defaults.imageModel`, `tools.media.image.models`, and provider image defaults such as MiniMax VL when the active chat model is text-only. Fixes #47614, #63722, #69171.
 - Codex/media understanding: support `codex/*` image models through bounded Codex app-server image turns, while keeping `openai-codex/*` on the OpenAI Codex OAuth route and validating app-server responses against generated protocol contracts. Fixes #70201.
 - Providers/OpenAI Codex: synthesize the `openai-codex/gpt-5.5` OAuth model row when Codex catalog discovery omits it, so cron and subagent runs do not fail with `Unknown model` while the account is authenticated.
 - Models/CLI: keep `openclaw models list` read-only while still showing eligible configured-provider rows, so listing models no longer rewrites per-agent `models.json`. (#70847) Thanks @shakkernerd.
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index 6f6f74be2bd..12b3c6d99f3 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -85,6 +85,15 @@ function resolveConfiguredImageModelId(params: {
   cfg: OpenClawConfig;
   providerId: string;
 }): string | undefined {
+  const configured = resolveConfiguredImageModel(params);
+  const id = configured?.id?.trim();
+  return id || undefined;
+}
+
+function resolveConfiguredImageModel(params: {
+  cfg: OpenClawConfig;
+  providerId: string;
+}): { id?: string; input?: string[] } | undefined {
   const providerCfg = findNormalizedProviderValue(
     params.cfg.models?.providers,
     params.providerId,
@@ -96,12 +105,10 @@ function resolveConfiguredImageModelId(params: {
     }>;
   } | undefined;
 
-  const configured = providerCfg?.models?.find((entry) => {
+  return providerCfg?.models?.find((entry) => {
     const id = entry?.id?.trim();
     return Boolean(id) && entry?.input?.includes("image");
   });
-  const id = configured?.id?.trim();
-  return id || undefined;
 }
 
 function resolveCatalogImageModelId(params: {
@@ -119,6 +126,23 @@
   return normalizeOptionalString((autoEntry ?? matches[0])?.id);
 }
 
+async function explicitImageModelVisionStatus(params: {
+  cfg: OpenClawConfig;
+  providerId: string;
+  model: string;
+}): Promise<"supported" | "unsupported" | "unknown"> {
+  const configured = resolveConfiguredImageModel(params);
+  if (configured?.id?.trim() === params.model && configured.input?.includes("image")) {
+    return "supported";
+  }
+  const catalog = await loadModelCatalog({ config: params.cfg });
+  const entry = findModelInCatalog(catalog, params.providerId, params.model);
+  if (!entry) {
+    return "unknown";
+  }
+  return modelSupportsVision(entry) ? "supported" : "unsupported";
"supported" : "unsupported"; +} + async function resolveAutoImageModelId(params: { cfg: OpenClawConfig; providerId: string; @@ -126,7 +150,14 @@ async function resolveAutoImageModelId(params: { }): Promise { const explicit = normalizeOptionalString(params.explicitModel); if (explicit) { - return explicit; + const explicitStatus = await explicitImageModelVisionStatus({ + cfg: params.cfg, + providerId: params.providerId, + model: explicit, + }); + if (explicitStatus !== "unsupported") { + return explicit; + } } const configuredModel = resolveConfiguredImageModelId(params); if (configuredModel) { @@ -498,6 +529,16 @@ function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstan return entries; } +function hasExplicitImageUnderstandingConfig(params: { + cfg: OpenClawConfig; + config?: MediaUnderstandingConfig; +}): boolean { + return ( + (params.config?.models?.length ?? 0) > 0 || + resolveImageModelFromAgentDefaults(params.cfg).length > 0 + ); +} + async function resolveAutoEntries(params: { cfg: OpenClawConfig; agentDir?: string; @@ -505,6 +546,12 @@ async function resolveAutoEntries(params: { capability: MediaUnderstandingCapability; activeModel?: ActiveMediaModel; }): Promise { + if (params.capability === "image") { + const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg); + if (imageModelEntries.length > 0) { + return imageModelEntries; + } + } const activeEntry = await resolveActiveModelEntry(params); if (activeEntry) { return [activeEntry]; @@ -519,12 +566,6 @@ async function resolveAutoEntries(params: { return [localAudio]; } } - if (params.capability === "image") { - const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg); - if (imageModelEntries.length > 0) { - return imageModelEntries; - } - } const gemini = await resolveGeminiCliEntry(params.capability); if (gemini) { return [gemini]; @@ -553,6 +594,12 @@ export async function resolveAutoImageModel(params: { } return { provider, model }; }; + const configuredImageModel = resolveImageModelFromAgentDefaults(params.cfg) + .map((entry) => toActive(entry)) + .find((entry): entry is ActiveMediaModel => entry !== null); + if (configuredImageModel) { + return configuredImageModel; + } const activeEntry = await resolveActiveModelEntry({ cfg: params.cfg, agentDir: params.agentDir, @@ -772,7 +819,11 @@ export async function runCapability(params: { // Skip image understanding when the primary model supports vision natively. // The image will be injected directly into the model context instead. const activeProvider = params.activeModel?.provider?.trim(); - if (capability === "image" && activeProvider) { + if ( + capability === "image" && + activeProvider && + !hasExplicitImageUnderstandingConfig({ cfg, config }) + ) { const catalog = await loadModelCatalog({ config: cfg }); const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? 
""); if (modelSupportsVision(entry)) { diff --git a/src/media-understanding/runner.vision-skip.test.ts b/src/media-understanding/runner.vision-skip.test.ts index 0ca988bf791..a26a669d363 100644 --- a/src/media-understanding/runner.vision-skip.test.ts +++ b/src/media-understanding/runner.vision-skip.test.ts @@ -13,7 +13,14 @@ import { setActivePluginRegistry } from "../plugins/runtime.js"; import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js"; import { withMediaFixture } from "./runner.test-utils.js"; -const baseCatalog = [ +type TestCatalogEntry = { + id: string; + name: string; + provider: string; + input: readonly string[]; +}; + +const baseCatalog: TestCatalogEntry[] = [ { id: "gpt-4.1", name: "GPT-4.1", @@ -21,7 +28,7 @@ const baseCatalog = [ input: ["text", "image"] as const, }, ]; -let catalog = [...baseCatalog]; +let catalog: TestCatalogEntry[] = [...baseCatalog]; const loadModelCatalog = vi.hoisted(() => vi.fn(async () => catalog)); @@ -141,6 +148,117 @@ describe("runCapability image skip", () => { } }); + it("uses explicit media image models instead of native vision skip", async () => { + await withMediaFixture( + { + filePrefix: "openclaw-image-explicit-vision", + extension: "png", + mediaType: "image/png", + fileContents: Buffer.from("image"), + }, + async ({ ctx, media, cache }) => { + const cfg = {} as OpenClawConfig; + + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, + agentDir: "/tmp", + providerRegistry: new Map([ + [ + "openrouter", + { + id: "openrouter", + capabilities: ["image"], + describeImage: async (req) => ({ text: "explicit ok", model: req.model }), + }, + ], + ]), + config: { + models: [{ provider: "openrouter", model: "google/gemini-2.5-flash" }], + }, + activeModel: { provider: "openai", model: "gpt-4.1" }, + }); + + expect(result.decision.outcome).toBe("success"); + expect(result.outputs[0]).toMatchObject({ + provider: "openrouter", + model: "google/gemini-2.5-flash", + text: "explicit ok", + }); + }, + ); + }); + + it("prefers agents.defaults.imageModel over the active model for auto image resolution", async () => { + const cfg = { + agents: { + defaults: { + imageModel: { primary: "openrouter/google/gemini-2.5-flash" }, + }, + }, + } as OpenClawConfig; + + await expect( + resolveAutoImageModel({ + cfg, + activeModel: { provider: "openai", model: "gpt-4.1" }, + }), + ).resolves.toEqual({ + provider: "openrouter", + model: "google/gemini-2.5-flash", + }); + }); + + it("falls back from an active text model to the provider image default", async () => { + catalog = [ + { + id: "MiniMax-M2.7", + name: "MiniMax M2.7", + provider: "minimax-portal", + input: ["text"] as const, + }, + { + id: "MiniMax-VL-01", + name: "MiniMax VL 01", + provider: "minimax-portal", + input: ["text", "image"] as const, + }, + ]; + vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key"); + const cfg = {} as OpenClawConfig; + const pluginRegistry = createEmptyPluginRegistry(); + pluginRegistry.mediaUnderstandingProviders.push({ + pluginId: "minimax", + pluginName: "MiniMax Provider", + source: "test", + provider: { + id: "minimax-portal", + capabilities: ["image"], + defaultModels: { image: "MiniMax-VL-01" }, + describeImage: async () => ({ text: "ok" }), + }, + }); + setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg); + + try { + await expect( + resolveAutoImageModel({ + cfg, + activeModel: { provider: "minimax-portal", model: "MiniMax-M2.7" }, + }), + ).resolves.toEqual({ + 
provider: "minimax-portal", + model: "MiniMax-VL-01", + }); + } finally { + setActivePluginRegistry(createEmptyPluginRegistry()); + vi.unstubAllEnvs(); + } + }); + it("uses active OpenRouter image models for auto image resolution", async () => { vi.stubEnv("OPENROUTER_API_KEY", "test-openrouter-key"); const cfg = {} as OpenClawConfig;