From d268c850e61a55422ea335f5fbf8154a1fcf24ce Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Fri, 24 Apr 2026 02:21:25 +0100
Subject: [PATCH] fix: honor explicit media image model routing

---
 CHANGELOG.md                          |   1 +
 src/media-understanding/runner.ts     |  73 +++++++++--
 .../runner.vision-skip.test.ts        | 122 +++++++++++++++++-
 3 files changed, 183 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e72bbe53426..a1b746703d1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Media understanding: honor explicit image-model configuration before native-vision skips, including `agents.defaults.imageModel`, `tools.media.image.models`, and provider image defaults such as MiniMax VL when the active chat model is text-only. Fixes #47614, #63722, #69171.
 - Codex/media understanding: support `codex/*` image models through bounded Codex app-server image turns, while keeping `openai-codex/*` on the OpenAI Codex OAuth route and validating app-server responses against generated protocol contracts. Fixes #70201.
 - Providers/OpenAI Codex: synthesize the `openai-codex/gpt-5.5` OAuth model row when Codex catalog discovery omits it, so cron and subagent runs do not fail with `Unknown model` while the account is authenticated.
 - Models/CLI: keep `openclaw models list` read-only while still showing eligible configured-provider rows, so listing models no longer rewrites per-agent `models.json`. (#70847) Thanks @shakkernerd.
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index 6f6f74be2bd..12b3c6d99f3 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -85,6 +85,15 @@ function resolveConfiguredImageModelId(params: {
   cfg: OpenClawConfig;
   providerId: string;
 }): string | undefined {
+  const configured = resolveConfiguredImageModel(params);
+  const id = configured?.id?.trim();
+  return id || undefined;
+}
+
+function resolveConfiguredImageModel(params: {
+  cfg: OpenClawConfig;
+  providerId: string;
+}): { id?: string; input?: string[] } | undefined {
   const providerCfg = findNormalizedProviderValue(
     params.cfg.models?.providers,
     params.providerId,
@@ -96,12 +105,10 @@ function resolveConfiguredImageModelId(params: {
     }>;
   } | undefined;
 
-  const configured = providerCfg?.models?.find((entry) => {
+  return providerCfg?.models?.find((entry) => {
     const id = entry?.id?.trim();
     return Boolean(id) && entry?.input?.includes("image");
   });
-  const id = configured?.id?.trim();
-  return id || undefined;
 }
 
 function resolveCatalogImageModelId(params: {
@@ -119,6 +126,23 @@
   return normalizeOptionalString((autoEntry ?? matches[0])?.id);
 }
 
+async function explicitImageModelVisionStatus(params: {
+  cfg: OpenClawConfig;
+  providerId: string;
+  model: string;
+}): Promise<"supported" | "unsupported" | "unknown"> {
+  const configured = resolveConfiguredImageModel(params);
+  if (configured?.id?.trim() === params.model && configured.input?.includes("image")) {
+    return "supported";
+  }
+  const catalog = await loadModelCatalog({ config: params.cfg });
+  const entry = findModelInCatalog(catalog, params.providerId, params.model);
+  if (!entry) {
+    return "unknown";
+  }
+  return modelSupportsVision(entry) ? "supported" : "unsupported";
"supported" : "unsupported"; +} + async function resolveAutoImageModelId(params: { cfg: OpenClawConfig; providerId: string; @@ -126,7 +150,14 @@ async function resolveAutoImageModelId(params: { }): Promise { const explicit = normalizeOptionalString(params.explicitModel); if (explicit) { - return explicit; + const explicitStatus = await explicitImageModelVisionStatus({ + cfg: params.cfg, + providerId: params.providerId, + model: explicit, + }); + if (explicitStatus !== "unsupported") { + return explicit; + } } const configuredModel = resolveConfiguredImageModelId(params); if (configuredModel) { @@ -498,6 +529,16 @@ function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstan return entries; } +function hasExplicitImageUnderstandingConfig(params: { + cfg: OpenClawConfig; + config?: MediaUnderstandingConfig; +}): boolean { + return ( + (params.config?.models?.length ?? 0) > 0 || + resolveImageModelFromAgentDefaults(params.cfg).length > 0 + ); +} + async function resolveAutoEntries(params: { cfg: OpenClawConfig; agentDir?: string; @@ -505,6 +546,12 @@ async function resolveAutoEntries(params: { capability: MediaUnderstandingCapability; activeModel?: ActiveMediaModel; }): Promise { + if (params.capability === "image") { + const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg); + if (imageModelEntries.length > 0) { + return imageModelEntries; + } + } const activeEntry = await resolveActiveModelEntry(params); if (activeEntry) { return [activeEntry]; @@ -519,12 +566,6 @@ async function resolveAutoEntries(params: { return [localAudio]; } } - if (params.capability === "image") { - const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg); - if (imageModelEntries.length > 0) { - return imageModelEntries; - } - } const gemini = await resolveGeminiCliEntry(params.capability); if (gemini) { return [gemini]; @@ -553,6 +594,12 @@ export async function resolveAutoImageModel(params: { } return { provider, model }; }; + const configuredImageModel = resolveImageModelFromAgentDefaults(params.cfg) + .map((entry) => toActive(entry)) + .find((entry): entry is ActiveMediaModel => entry !== null); + if (configuredImageModel) { + return configuredImageModel; + } const activeEntry = await resolveActiveModelEntry({ cfg: params.cfg, agentDir: params.agentDir, @@ -772,7 +819,11 @@ export async function runCapability(params: { // Skip image understanding when the primary model supports vision natively. // The image will be injected directly into the model context instead. const activeProvider = params.activeModel?.provider?.trim(); - if (capability === "image" && activeProvider) { + if ( + capability === "image" && + activeProvider && + !hasExplicitImageUnderstandingConfig({ cfg, config }) + ) { const catalog = await loadModelCatalog({ config: cfg }); const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? 
""); if (modelSupportsVision(entry)) { diff --git a/src/media-understanding/runner.vision-skip.test.ts b/src/media-understanding/runner.vision-skip.test.ts index 0ca988bf791..a26a669d363 100644 --- a/src/media-understanding/runner.vision-skip.test.ts +++ b/src/media-understanding/runner.vision-skip.test.ts @@ -13,7 +13,14 @@ import { setActivePluginRegistry } from "../plugins/runtime.js"; import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js"; import { withMediaFixture } from "./runner.test-utils.js"; -const baseCatalog = [ +type TestCatalogEntry = { + id: string; + name: string; + provider: string; + input: readonly string[]; +}; + +const baseCatalog: TestCatalogEntry[] = [ { id: "gpt-4.1", name: "GPT-4.1", @@ -21,7 +28,7 @@ const baseCatalog = [ input: ["text", "image"] as const, }, ]; -let catalog = [...baseCatalog]; +let catalog: TestCatalogEntry[] = [...baseCatalog]; const loadModelCatalog = vi.hoisted(() => vi.fn(async () => catalog)); @@ -141,6 +148,117 @@ describe("runCapability image skip", () => { } }); + it("uses explicit media image models instead of native vision skip", async () => { + await withMediaFixture( + { + filePrefix: "openclaw-image-explicit-vision", + extension: "png", + mediaType: "image/png", + fileContents: Buffer.from("image"), + }, + async ({ ctx, media, cache }) => { + const cfg = {} as OpenClawConfig; + + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, + agentDir: "/tmp", + providerRegistry: new Map([ + [ + "openrouter", + { + id: "openrouter", + capabilities: ["image"], + describeImage: async (req) => ({ text: "explicit ok", model: req.model }), + }, + ], + ]), + config: { + models: [{ provider: "openrouter", model: "google/gemini-2.5-flash" }], + }, + activeModel: { provider: "openai", model: "gpt-4.1" }, + }); + + expect(result.decision.outcome).toBe("success"); + expect(result.outputs[0]).toMatchObject({ + provider: "openrouter", + model: "google/gemini-2.5-flash", + text: "explicit ok", + }); + }, + ); + }); + + it("prefers agents.defaults.imageModel over the active model for auto image resolution", async () => { + const cfg = { + agents: { + defaults: { + imageModel: { primary: "openrouter/google/gemini-2.5-flash" }, + }, + }, + } as OpenClawConfig; + + await expect( + resolveAutoImageModel({ + cfg, + activeModel: { provider: "openai", model: "gpt-4.1" }, + }), + ).resolves.toEqual({ + provider: "openrouter", + model: "google/gemini-2.5-flash", + }); + }); + + it("falls back from an active text model to the provider image default", async () => { + catalog = [ + { + id: "MiniMax-M2.7", + name: "MiniMax M2.7", + provider: "minimax-portal", + input: ["text"] as const, + }, + { + id: "MiniMax-VL-01", + name: "MiniMax VL 01", + provider: "minimax-portal", + input: ["text", "image"] as const, + }, + ]; + vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key"); + const cfg = {} as OpenClawConfig; + const pluginRegistry = createEmptyPluginRegistry(); + pluginRegistry.mediaUnderstandingProviders.push({ + pluginId: "minimax", + pluginName: "MiniMax Provider", + source: "test", + provider: { + id: "minimax-portal", + capabilities: ["image"], + defaultModels: { image: "MiniMax-VL-01" }, + describeImage: async () => ({ text: "ok" }), + }, + }); + setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg); + + try { + await expect( + resolveAutoImageModel({ + cfg, + activeModel: { provider: "minimax-portal", model: "MiniMax-M2.7" }, + }), + ).resolves.toEqual({ + 
provider: "minimax-portal", + model: "MiniMax-VL-01", + }); + } finally { + setActivePluginRegistry(createEmptyPluginRegistry()); + vi.unstubAllEnvs(); + } + }); + it("uses active OpenRouter image models for auto image resolution", async () => { vi.stubEnv("OPENROUTER_API_KEY", "test-openrouter-key"); const cfg = {} as OpenClawConfig;