From 5d1f7bf058949da6be6a614a66a4e1d1fa8f245e Mon Sep 17 00:00:00 2001
From: Peter Steinberger <peter@steipete.me>
Date: Sun, 17 May 2026 08:45:50 +0100
Subject: [PATCH] fix: route image URL describe requests through MiniMax VLM

Summary:
- Preserve HTTP(S) image describe inputs as remote media.
- Route MiniMax CN image understanding through MiniMax-VL-01.
- Cover CLI, media runtime, tools, Telegram stickers, docs, and changelog.

Verification:
- codex-review clean
- pnpm check:changed via Blacksmith Testbox tbx_01krtdekwak0mygxbw5z7cfb6z
- PR CI green on 516281448e6d5499ce17928d820f1c4d24a0b612
---
 CHANGELOG.md                                  |   1 +
 docs/cli/infer.md                             |  28 +-
 docs/nodes/media-understanding.md             |   4 +-
 .../src/sticker-cache.describe.test.ts        | 125 +++++++
 extensions/telegram/src/sticker-cache.ts      |  26 +-
 .../minimax-vlm.normalizes-api-key.test.ts    |  57 ++++
 src/agents/minimax-vlm.ts                     |  27 +-
 src/agents/tools/image-tool.helpers.ts        |   4 +
 src/agents/tools/image-tool.test.ts           | 125 +++++++
 src/agents/tools/image-tool.ts                |  67 +++-
 .../tools/pdf-tool.model-config.test.ts       |  32 ++
 src/agents/tools/pdf-tool.model-config.ts     |   4 +
 src/cli/capability-cli.test.ts                |  42 +++
 src/cli/capability-cli.ts                     |   9 +-
 src/media-understanding/attachments.cache.ts  |  12 +-
 src/media-understanding/defaults.test.ts      |  54 +++
 src/media-understanding/defaults.ts           |  49 ++-
 src/media-understanding/image.test.ts         | 129 +++++++
 src/media-understanding/image.ts              |  55 ++-
 .../media-understanding-misc.test.ts          |  22 ++
 src/media-understanding/provider-id.ts        |  14 +
 src/media-understanding/provider-registry.ts  |   2 +-
 src/media-understanding/runner.entries.ts     |   8 +-
 src/media-understanding/runner.ts             |  29 +-
 .../runner.vision-skip.test.ts                | 316 +++++++++++++++++-
 src/media-understanding/runtime.test.ts       | 127 +++++++
 src/media-understanding/runtime.ts            | 134 ++++++--
 27 files changed, 1425 insertions(+), 77 deletions(-)
 create mode 100644 extensions/telegram/src/sticker-cache.describe.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 80aaa56905c..f75edb650b9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -85,6 +85,7 @@ Docs: https://docs.openclaw.ai
 - Agents/followups: route queued followup turns through CLI runtime backends instead of embedded harness lookup, preventing `claude-cli`/`google-gemini-cli` followups from failing before delivery. Fixes #82847. (#82857) Thanks @hclsys.
 - CLI/sessions: let `openclaw sessions cleanup --fix-missing` prune malformed rows with unresolvable transcript metadata instead of throwing. Fixes #80970. (#82745) Thanks @IWhatsskill.
 - Gateway/usage: refresh large session usage summaries in the background and reuse durable transcript metadata so `sessions.usage` no longer blocks Gateway requests on full transcript rescans. Fixes #82773. (#82778) Thanks @hclsys.
+- CLI/MiniMax media: let `openclaw infer image describe --file` accept HTTP(S) image URLs instead of treating them as local paths, keep automatic MiniMax image understanding routed through `MiniMax-VL-01` even when legacy MiniMax M2.x chat metadata claims image input, and default `minimax-cn`/`minimax-portal-cn` VLM requests to the `api.minimaxi.com` host. Fixes #82837. Thanks @mGaolin.
 - TUI: restore the submitted draft when chat is busy instead of clearing it or queueing another run. Fixes #45326. (#82774) Thanks @hyspacex.
 - Cron/memory: treat claimed `before_agent_reply` cron hooks as execution progress, so long memory dreaming promotion jobs are not aborted by the isolated-run pre-execution watchdog. Fixes #82811.
 - Discord: recover transcript-backed full answers when progress-mode final payloads are ellipsis-truncated, so long replies fall back to normal chunked delivery instead of replacing the preview with a shortened message. Fixes #82807. Thanks @blueberry6401.
diff --git a/docs/cli/infer.md b/docs/cli/infer.md
index b28b75ec438..98f8b5b905e 100644
--- a/docs/cli/infer.md
+++ b/docs/cli/infer.md
@@ -107,19 +107,19 @@ runtime before the provider request is made.
 
 This table maps common inference tasks to the corresponding infer command.
 
-| Task                         | Command                                                                                       | Notes                                                 |
-| ---------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- |
-| Run a text/model prompt      | `openclaw infer model run --prompt "..." --json`                                              | Uses the normal local path by default                 |
-| Run a model prompt on images | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs             |
-| Generate an image            | `openclaw infer image generate --prompt "..." --json`                                         | Use `image edit` when starting from an existing file  |
-| Describe an image file       | `openclaw infer image describe --file ./image.png --prompt "..." --json`                      | `--model` must be an image-capable `<provider/model>` |
-| Transcribe audio             | `openclaw infer audio transcribe --file ./memo.m4a --json`                                    | `--model` must be `<provider/model>`                  |
-| Synthesize speech            | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json`                        | `tts status` is gateway-oriented                      |
-| Generate a video             | `openclaw infer video generate --prompt "..." --json`                                         | Supports provider hints such as `--resolution`        |
-| Describe a video file        | `openclaw infer video describe --file ./clip.mp4 --json`                                      | `--model` must be `<provider/model>`                  |
-| Search the web               | `openclaw infer web search --query "..." --json`                                              |                                                       |
-| Fetch a web page             | `openclaw infer web fetch --url https://example.com --json`                                   |                                                       |
-| Create embeddings            | `openclaw infer embedding create --text "..." --json`                                         |                                                       |
+| Task                          | Command                                                                                       | Notes                                                 |
+| ----------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- |
+| Run a text/model prompt       | `openclaw infer model run --prompt "..." --json`                                              | Uses the normal local path by default                 |
+| Run a model prompt on images  | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs             |
+| Generate an image             | `openclaw infer image generate --prompt "..." --json`                                         | Use `image edit` when starting from an existing file  |
+| Describe an image file or URL | `openclaw infer image describe --file ./image.png --prompt "..." --json`                      | `--model` must be an image-capable `<provider/model>` |
+| Transcribe audio              | `openclaw infer audio transcribe --file ./memo.m4a --json`                                    | `--model` must be `<provider/model>`                  |
+| Synthesize speech             | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json`                        | `tts status` is gateway-oriented                      |
+| Generate a video              | `openclaw infer video generate --prompt "..." --json`                                         | Supports provider hints such as `--resolution`        |
+| Describe a video file         | `openclaw infer video describe --file ./clip.mp4 --json`                                      | `--model` must be `<provider/model>`                  |
+| Search the web                | `openclaw infer web search --query "..." --json`                                              |                                                       |
+| Fetch a web page              | `openclaw infer web fetch --url https://example.com --json`                                   |                                                       |
+| Create embeddings             | `openclaw infer embedding create --text "..." --json`                                         |                                                       |
 
 ## Behavior
 
@@ -128,6 +128,7 @@ This table maps common inference tasks to the corresponding infer command.
 - Use `--provider` or `--model provider/model` when a specific backend is required.
 - Use `model run --thinking <level>` to pass a one-shot thinking/reasoning level (`off`, `minimal`, `low`, `medium`, `high`, `adaptive`, `xhigh`, or `max`) while keeping the run raw.
 - For `image describe`, `audio transcribe`, and `video describe`, `--model` must use the form `<provider/model>`.
+- For `image describe`, `--file` accepts local paths and HTTP(S) image URLs. Remote URLs use the normal media-fetch SSRF policy.
 - For `image describe`, an explicit `--model` runs that provider/model directly. The model must be image-capable in the model catalog or provider config. `codex/<model>` runs a bounded Codex app-server image-understanding turn; `openai-codex/<model>` uses the OpenAI Codex OAuth provider path.
 - Stateless execution commands default to local.
 - Gateway-managed state commands default to gateway.
@@ -192,6 +193,7 @@ openclaw infer image generate --prompt "slow image backend" --timeout-ms 180000
 openclaw infer image edit --file ./logo.png --model openai/gpt-image-1.5 --output-format png --background transparent --prompt "keep the logo, remove the background" --json
 openclaw infer image edit --file ./poster.png --prompt "make this a vertical story ad" --size 2160x3840 --aspect-ratio 9:16 --resolution 4K --json
 openclaw infer image describe --file ./photo.jpg --json
+openclaw infer image describe --file https://example.com/photo.png --json
 openclaw infer image describe --file ./receipt.jpg --prompt "Extract the merchant, date, and total" --json
 openclaw infer image describe-many --file ./before.png --file ./after.png --prompt "Compare the screenshots and list visible UI changes" --json
 openclaw infer image describe --file ./ui-screenshot.png --model openai/gpt-4.1-mini --json
diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md
index 8c48588e494..40714dae2f4 100644
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -260,8 +260,8 @@ For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
 <Note>
 **MiniMax note**
 
-- `minimax` and `minimax-portal` image understanding comes from the plugin-owned `MiniMax-VL-01` media provider.
-- The bundled MiniMax text catalog still starts text-only; explicit `models.providers.minimax` entries materialize image-capable M2.7 chat refs.
+- `minimax`, `minimax-cn`, `minimax-portal`, and `minimax-portal-cn` image understanding comes from the plugin-owned `MiniMax-VL-01` media provider.
+- Automatic image routing keeps using `MiniMax-VL-01` even if legacy MiniMax M2.x chat metadata claims image input.
 
 </Note>
 
diff --git a/extensions/telegram/src/sticker-cache.describe.test.ts b/extensions/telegram/src/sticker-cache.describe.test.ts
new file mode 100644
index 00000000000..803750a2e71
--- /dev/null
+++ b/extensions/telegram/src/sticker-cache.describe.test.ts
@@ -0,0 +1,125 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import { describeStickerImage } from "./sticker-cache.js";
+
+const mocks = vi.hoisted(() => {
+  const describeImageFileWithModel = vi.fn(async () => ({
+    text: "vlm ok",
+    model: "MiniMax-VL-01",
+  }));
+  return {
+    describeImageFileWithModel,
+    findModelInCatalog: vi.fn((_catalog, provider: string, model: string) => ({
+      provider,
+      id: model,
+      input: ["text", "image"],
+    })),
+    loadModelCatalog: vi.fn(async () => [
+      { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] },
+      { provider: "minimax", id: "MiniMax-M2.7", input: ["text", "image"] },
+    ]),
+    modelSupportsVision: vi.fn((entry: { input?: string[] } | undefined) =>
+      Boolean(entry?.input?.includes("image")),
+    ),
+    resolveApiKeyForProvider: vi.fn(async () => ({ apiKey: "minimax-test" })),
+    resolveAutoImageModel: vi.fn(async () => ({
+      provider: "minimax-cn",
+      model: "MiniMax-VL-01",
+    })),
+    resolveAutoMediaKeyProviders: vi.fn(() => ["minimax-cn", "minimax"]),
+    resolveDefaultMediaModel: vi.fn(() => "MiniMax-VL-01"),
+    resolveDefaultModelForAgent: vi.fn(() => ({
+      provider: "minimax-cn",
+      model: "MiniMax-M2.7",
+    })),
+  };
+});
+
+vi.mock("openclaw/plugin-sdk/agent-runtime", () => ({
+  findModelInCatalog: mocks.findModelInCatalog,
+  loadModelCatalog: mocks.loadModelCatalog,
+  modelSupportsVision: mocks.modelSupportsVision,
+  resolveApiKeyForProvider: mocks.resolveApiKeyForProvider,
+  resolveDefaultModelForAgent: mocks.resolveDefaultModelForAgent,
+}));
+
+vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
+  resolveAutoImageModel: mocks.resolveAutoImageModel,
+  resolveAutoMediaKeyProviders: mocks.resolveAutoMediaKeyProviders,
+  resolveDefaultMediaModel: mocks.resolveDefaultMediaModel,
+}));
+
+vi.mock("./runtime.js", () => ({
+  getTelegramRuntime: () => ({
+    mediaUnderstanding: {
+      describeImageFileWithModel: mocks.describeImageFileWithModel,
+    },
+  }),
+}));
+
+describe("describeStickerImage", () => {
+  beforeEach(() => {
+    mocks.describeImageFileWithModel.mockClear();
+    mocks.findModelInCatalog.mockClear();
+    mocks.loadModelCatalog.mockClear();
+    mocks.modelSupportsVision.mockClear();
+    mocks.resolveApiKeyForProvider.mockClear();
+    mocks.resolveAutoImageModel.mockClear();
+    mocks.resolveAutoMediaKeyProviders.mockClear();
+    mocks.resolveDefaultMediaModel.mockClear();
+    mocks.resolveDefaultModelForAgent.mockClear();
+  });
+
+  it("uses MiniMax VLM auto selection instead of legacy chat vision catalog entries", async () => {
+    await expect(
+      describeStickerImage({
+        imagePath: "/tmp/sticker.webp",
+        cfg: {},
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toBe("vlm ok");
+
+    expect(mocks.resolveDefaultMediaModel).toHaveBeenCalledWith({
+      cfg: {},
+      providerId: "minimax-cn",
+      capability: "image",
+      includeConfiguredImageModels: false,
+    });
+    expect(mocks.resolveAutoImageModel).not.toHaveBeenCalled();
+    expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith(
+      expect.objectContaining({
+        filePath: "/tmp/sticker.webp",
+        provider: "minimax-cn",
+        model: "MiniMax-VL-01",
+      }),
+    );
+  });
+
+  it("keeps MiniMax chat defaults on MiniMax VLM when other vision providers are configured", async () => {
+    mocks.resolveAutoMediaKeyProviders.mockReturnValue(["openai", "minimax-cn", "minimax"]);
+    mocks.loadModelCatalog.mockResolvedValue([
+      { provider: "openai", id: "gpt-5.4", input: ["text", "image"] },
+      { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] },
+      { provider: "minimax-cn", id: "MiniMax-VL-01", input: ["image"] },
+    ]);
+
+    await expect(
+      describeStickerImage({
+        imagePath: "/tmp/sticker.webp",
+        cfg: {},
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toBe("vlm ok");
+
+    expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "minimax-cn",
+        model: "MiniMax-VL-01",
+      }),
+    );
+    expect(mocks.describeImageFileWithModel).not.toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "openai",
+      }),
+    );
+  });
+});
diff --git a/extensions/telegram/src/sticker-cache.ts b/extensions/telegram/src/sticker-cache.ts
index 3bb158fd099..a9dd3d30f78 100644
--- a/extensions/telegram/src/sticker-cache.ts
+++ b/extensions/telegram/src/sticker-cache.ts
@@ -27,6 +27,16 @@ export {
 const STICKER_DESCRIPTION_PROMPT =
   "Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
 
+function isMinimaxVlmProvider(provider: string): boolean {
+  const normalized = normalizeLowercaseStringOrEmpty(provider);
+  return (
+    normalized === "minimax" ||
+    normalized === "minimax-cn" ||
+    normalized === "minimax-portal" ||
+    normalized === "minimax-portal-cn"
+  );
+}
+
 export interface DescribeStickerParams {
   imagePath: string;
   cfg: OpenClawConfig;
@@ -50,7 +60,17 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
     const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
     const supportsVision = modelSupportsVision(entry);
     if (supportsVision) {
-      activeModel = { provider: defaultModel.provider, model: defaultModel.model };
+      const model = isMinimaxVlmProvider(defaultModel.provider)
+        ? resolveDefaultMediaModel({
+            cfg,
+            providerId: defaultModel.provider,
+            capability: "image",
+            includeConfiguredImageModels: false,
+          })
+        : defaultModel.model;
+      if (model) {
+        activeModel = { provider: defaultModel.provider, model };
+      }
     }
   } catch {
     // Ignore catalog failures; fall back to auto selection.
@@ -83,8 +103,12 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
       cfg,
       providerId: provider,
       capability: "image",
+      includeConfiguredImageModels: !isMinimaxVlmProvider(provider),
     });
     const preferred = entries.find((entry) => entry.id === defaultId);
+    if (isMinimaxVlmProvider(provider)) {
+      return preferred;
+    }
     return preferred ?? entries[0];
   };
 
diff --git a/src/agents/minimax-vlm.normalizes-api-key.test.ts b/src/agents/minimax-vlm.normalizes-api-key.test.ts
index e4f470d93cb..31f2ba21f5e 100644
--- a/src/agents/minimax-vlm.normalizes-api-key.test.ts
+++ b/src/agents/minimax-vlm.normalizes-api-key.test.ts
@@ -75,6 +75,61 @@ describe("minimaxUnderstandImage apiKey normalization", () => {
     expect(fetchSpy).toHaveBeenCalledOnce();
   });
 
+  it.each(["minimax-cn", "minimax-portal-cn"])(
+    "routes %s to the CN VLM host by default",
+    async (provider) => {
+      const fetchSpy = vi.fn(async (input: RequestInfo | URL) => {
+        const requestUrl =
+          typeof input === "string" ? input : input instanceof URL ? input.href : input.url;
+        expect(requestUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
+        return new Response(apiResponse, {
+          status: 200,
+          headers: { "Content-Type": "application/json" },
+        });
+      });
+      global.fetch = withFetchPreconnect(fetchSpy);
+
+      await expect(
+        minimaxUnderstandImage({
+          apiKey: "minimax-test-key",
+          provider,
+          prompt: "hi",
+          imageDataUrl: "data:image/png;base64,AAAA",
+        }),
+      ).resolves.toBe("ok");
+
+      expect(fetchSpy).toHaveBeenCalledOnce();
+    },
+  );
+
+  it.each(["minimax-cn", "minimax-portal-cn"])(
+    "keeps %s on the CN VLM host when the configured host is malformed",
+    async (provider) => {
+      const fetchSpy = vi.fn(async (input: RequestInfo | URL) => {
+        const requestUrl =
+          typeof input === "string" ? input : input instanceof URL ? input.href : input.url;
+        expect(requestUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
+        return new Response(apiResponse, {
+          status: 200,
+          headers: { "Content-Type": "application/json" },
+        });
+      });
+      global.fetch = withFetchPreconnect(fetchSpy);
+
+      await expect(
+        minimaxUnderstandImage({
+          apiKey: "minimax-test-key",
+          provider,
+          apiHost: "https://[",
+          prompt: "hi",
+          imageDataUrl: "data:image/png;base64,AAAA",
+        }),
+      ).resolves.toBe("ok");
+
+      expect(fetchSpy).toHaveBeenCalledOnce();
+    },
+  );
+
   it("uses the caller-provided request timeout", async () => {
     const timeoutSpy = vi.spyOn(AbortSignal, "timeout");
     const fetchSpy = vi.fn(async () => {
@@ -103,7 +158,9 @@ describe("minimaxUnderstandImage apiKey normalization", () => {
 describe("isMinimaxVlmModel", () => {
   it("only matches the canonical MiniMax VLM model id", () => {
     expect(isMinimaxVlmModel("minimax", "MiniMax-VL-01")).toBe(true);
+    expect(isMinimaxVlmModel("minimax-cn", "MiniMax-VL-01")).toBe(true);
     expect(isMinimaxVlmModel("minimax-portal", "MiniMax-VL-01")).toBe(true);
+    expect(isMinimaxVlmModel("minimax-portal-cn", "MiniMax-VL-01")).toBe(true);
     expect(isMinimaxVlmModel("minimax-portal", "custom-vision")).toBe(false);
     expect(isMinimaxVlmModel("openai", "MiniMax-VL-01")).toBe(false);
   });
diff --git a/src/agents/minimax-vlm.ts b/src/agents/minimax-vlm.ts
index bde911a17e0..fa002249515 100644
--- a/src/agents/minimax-vlm.ts
+++ b/src/agents/minimax-vlm.ts
@@ -8,35 +8,54 @@ type MinimaxBaseResp = {
 };
 
 export function isMinimaxVlmProvider(provider: string): boolean {
-  return provider === "minimax" || provider === "minimax-portal";
+  const normalized = provider.trim().toLowerCase();
+  return (
+    normalized === "minimax" ||
+    normalized === "minimax-cn" ||
+    normalized === "minimax-portal" ||
+    normalized === "minimax-portal-cn"
+  );
 }
 
 export function isMinimaxVlmModel(provider: string, modelId: string): boolean {
   return isMinimaxVlmProvider(provider) && modelId.trim() === "MiniMax-VL-01";
 }
 
+function isMinimaxCnProvider(provider: string | undefined): boolean {
+  const normalized = provider?.trim().toLowerCase();
+  return normalized === "minimax-cn" || normalized === "minimax-portal-cn";
+}
+
 function coerceApiHost(params: {
   apiHost?: string;
   modelBaseUrl?: string;
+  provider?: string;
   env?: NodeJS.ProcessEnv;
 }): string {
   const env = params.env ?? process.env;
+  const defaultHost = isMinimaxCnProvider(params.provider)
+    ? "https://api.minimaxi.com"
+    : "https://api.minimax.io";
   const raw =
     params.apiHost?.trim() ||
     env.MINIMAX_API_HOST?.trim() ||
     params.modelBaseUrl?.trim() ||
-    "https://api.minimax.io";
+    defaultHost;
 
   try {
     const url = new URL(raw);
     return url.origin;
   } catch {}
 
+  if (/^[a-z][a-z\d+.-]*:\/\//i.test(raw)) {
+    return defaultHost;
+  }
+
   try {
     const url = new URL(`https://${raw}`);
     return url.origin;
   } catch {
-    return "https://api.minimax.io";
+    return defaultHost;
   }
 }
 
@@ -51,6 +70,7 @@ export async function minimaxUnderstandImage(params: {
   imageDataUrl: string;
   apiHost?: string;
   modelBaseUrl?: string;
+  provider?: string;
   timeoutMs?: number;
 }): Promise<string> {
   const apiKey = normalizeSecretInput(params.apiKey);
@@ -72,6 +92,7 @@ export async function minimaxUnderstandImage(params: {
   const host = coerceApiHost({
     apiHost: params.apiHost,
     modelBaseUrl: params.modelBaseUrl,
+    provider: params.provider,
   });
   const url = new URL("/v1/coding_plan/vlm", host).toString();
 
diff --git a/src/agents/tools/image-tool.helpers.ts b/src/agents/tools/image-tool.helpers.ts
index ab7a178c6d8..58508304624 100644
--- a/src/agents/tools/image-tool.helpers.ts
+++ b/src/agents/tools/image-tool.helpers.ts
@@ -2,6 +2,7 @@ import type { AssistantMessage } from "@earendil-works/pi-ai";
 import type { OpenClawConfig } from "../../config/types.openclaw.js";
 import { estimateBase64DecodedBytes } from "../../media/base64.js";
 import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
+import { isMinimaxVlmProvider } from "../minimax-vlm.js";
 import { findNormalizedProviderValue, normalizeProviderId } from "../model-selection.js";
 import { extractAssistantText } from "../pi-embedded-utils.js";
 import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.helpers.js";
@@ -238,6 +239,9 @@ export function resolveProviderVisionModelFromConfig(params: {
   cfg?: OpenClawConfig;
   provider: string;
 }): string | null {
+  if (isMinimaxVlmProvider(params.provider)) {
+    return null;
+  }
   const providerCfg = findNormalizedProviderValue(
     params.cfg?.models?.providers,
     params.provider,
diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts
index 383c5247a23..601a66bee83 100644
--- a/src/agents/tools/image-tool.test.ts
+++ b/src/agents/tools/image-tool.test.ts
@@ -181,7 +181,9 @@ async function createOpenClawCodingToolsWithFreshModules(options?: CreateOpenCla
   const defaultImageModels = new Map<string, string>([
     ["anthropic", "claude-opus-4-6"],
     ["minimax", "MiniMax-VL-01"],
+    ["minimax-cn", "MiniMax-VL-01"],
     ["minimax-portal", "MiniMax-VL-01"],
+    ["minimax-portal-cn", "MiniMax-VL-01"],
     ["openai", "gpt-5.4-mini"],
     ["opencode", "gpt-5-nano"],
     ["opencode-go", "kimi-k2.6"],
@@ -482,7 +484,9 @@ function installImageUnderstandingProviderStubs(...providers: MediaUnderstanding
   const defaultImageModels = new Map<string, string>([
     ["anthropic", "claude-opus-4-6"],
     ["minimax", "MiniMax-VL-01"],
+    ["minimax-cn", "MiniMax-VL-01"],
     ["minimax-portal", "MiniMax-VL-01"],
+    ["minimax-portal-cn", "MiniMax-VL-01"],
     ["openai", "gpt-5.4-mini"],
     ["opencode", "gpt-5-nano"],
     ["opencode-go", "kimi-k2.6"],
@@ -764,6 +768,127 @@ describe("image tool implicit imageModel config", () => {
     });
   });
 
+  it("keeps MiniMax CN chat metadata off automatic image routing", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      const cfg: OpenClawConfig = {
+        agents: { defaults: { model: { primary: "minimax-cn/MiniMax-M2.5" } } },
+        models: {
+          mode: "merge",
+          providers: {
+            "minimax-cn": {
+              baseUrl: "https://api.minimaxi.com/anthropic",
+              apiKey: "${MINIMAX_API_KEY}",
+              api: "anthropic-messages",
+              models: [makeModelDefinition("MiniMax-M2.5", ["text", "image"])],
+            },
+          },
+        },
+      };
+      const authStore = {
+        version: 1,
+        profiles: {
+          mini: { type: "api_key", provider: "minimax-cn", key: "minimax-test" },
+          miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
+        },
+      } as const;
+
+      expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
+        primary: "minimax-cn/MiniMax-VL-01",
+      });
+    });
+  });
+
+  it("prefers configured MiniMax CN image alias over canonical auto fallback", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      const defaultImageModels = new Map<string, string>([
+        ["anthropic", "claude-opus-4-6"],
+        ["minimax", "MiniMax-VL-01"],
+        ["minimax-cn", "MiniMax-VL-01"],
+        ["openai", "gpt-5.4-mini"],
+      ]);
+      __testing.setProviderDepsForTest({
+        buildProviderRegistry: (overrides?: Record<string, MediaUnderstandingProvider>) =>
+          imageProviderHarness.buildProviderRegistry(overrides),
+        getMediaUnderstandingProvider: (
+          id: string,
+          registry: Map<string, MediaUnderstandingProvider>,
+        ) => imageProviderHarness.getMediaUnderstandingProvider(id, registry),
+        describeImageWithModel: describeGenericImageWithModel,
+        describeImagesWithModel: describeGenericImagesWithModel,
+        resolveAutoMediaKeyProviders: ({ capability }) =>
+          capability === "image" ? ["openai", "anthropic", "minimax-cn", "minimax"] : [],
+        resolveDefaultMediaModel: ({ providerId, capability }) =>
+          capability === "image" ? defaultImageModels.get(providerId.toLowerCase()) : undefined,
+      });
+      const cfg: OpenClawConfig = {
+        models: {
+          mode: "merge",
+          providers: {
+            "minimax-cn": {
+              baseUrl: "https://api.minimaxi.com/anthropic",
+              apiKey: "${MINIMAX_API_KEY}",
+              api: "anthropic-messages",
+              models: [makeModelDefinition("MiniMax-M2.5", ["text", "image"])],
+            },
+          },
+        },
+      };
+      const authStore = {
+        version: 1,
+        profiles: {
+          mini: { type: "api_key", provider: "minimax-cn", key: "minimax-test" },
+          miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
+        },
+      } as const;
+
+      expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
+        primary: "minimax-cn/MiniMax-VL-01",
+      });
+    });
+  });
+
+  it("keeps canonical MiniMax fallback when configured CN alias has no image candidate", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      __testing.setProviderDepsForTest({
+        buildProviderRegistry: (overrides?: Record<string, MediaUnderstandingProvider>) =>
+          imageProviderHarness.buildProviderRegistry(overrides),
+        getMediaUnderstandingProvider: (
+          id: string,
+          registry: Map<string, MediaUnderstandingProvider>,
+        ) => imageProviderHarness.getMediaUnderstandingProvider(id, registry),
+        describeImageWithModel: describeGenericImageWithModel,
+        describeImagesWithModel: describeGenericImagesWithModel,
+        resolveAutoMediaKeyProviders: ({ capability }) =>
+          capability === "image" ? ["minimax"] : [],
+        resolveDefaultMediaModel: ({ providerId, capability }) =>
+          capability === "image" && providerId === "minimax" ? "MiniMax-VL-01" : undefined,
+      });
+      const cfg: OpenClawConfig = {
+        models: {
+          mode: "merge",
+          providers: {
+            "minimax-cn": {
+              baseUrl: "https://api.minimaxi.com/anthropic",
+              apiKey: "${MINIMAX_API_KEY}",
+              api: "anthropic-messages",
+              models: [],
+            },
+          },
+        },
+      };
+      const authStore = {
+        version: 1,
+        profiles: {
+          miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
+        },
+      } as const;
+
+      expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
+        primary: "minimax/MiniMax-VL-01",
+      });
+    });
+  });
+
   it("passes the configured image timeout to provider calls", async () => {
     await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => {
       await withTempAgentDir(async (agentDir) => {
diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts
index 6eeec03d4e5..9eb64e99f1e 100644
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -68,6 +68,50 @@ const imageToolProviderDeps = {
   resolveDefaultMediaModel,
 };
 
+function hasExplicitDefaultPrimaryModel(cfg?: OpenClawConfig): boolean { // True when agents.defaults.model names a non-blank primary model.
+  const model = cfg?.agents?.defaults?.model;
+  if (typeof model === "string") { // string form: the value itself is the model ref
+    return model.trim().length > 0;
+  }
+  return typeof model?.primary === "string" && model.primary.trim().length > 0; // object form: inspect .primary; a missing model yields false
+}
+
+function modelRefProvider(candidate: string | null | undefined): string | undefined { // Provider prefix of a "provider/model" ref, if any.
+  const trimmed = candidate?.trim();
+  if (!trimmed?.includes("/")) { // nullish, blank, or no "/" separator
+    return undefined;
+  }
+  return trimmed.slice(0, trimmed.indexOf("/")).trim(); // text before the first "/"; "" when the ref starts with "/"
+}
+
+function isExecutionAliasCandidateForProvider( // True when candidate's provider prefix is a non-normalized id that normalizes to the same id as provider.
+  candidate: string | null | undefined,
+  provider: string,
+): boolean {
+  const candidateProvider = modelRefProvider(candidate);
+  return Boolean(
+    candidateProvider && // undefined or "" prefix can never be an alias
+    candidateProvider !== normalizeMediaProviderId(candidateProvider) && // prefix changes under normalization
+    normalizeMediaProviderId(candidateProvider) === normalizeMediaProviderId(provider), // both normalize to the same id
+  );
+}
+
+function isCanonicalCandidateShadowedByExecutionAlias( // True when a normalized-id MiniMax VLM candidate has an alias counterpart in candidates.
+  candidate: string | null | undefined,
+  candidates: readonly (string | null | undefined)[],
+): boolean {
+  const candidateProvider = modelRefProvider(candidate);
+  if (!candidateProvider || candidateProvider !== normalizeMediaProviderId(candidateProvider)) { // no prefix, or prefix is not already in normalized form
+    return false;
+  }
+  if (!isMinimaxVlmProvider(candidateProvider)) { // shadowing is applied only to providers accepted by isMinimaxVlmProvider
+    return false;
+  }
+  return candidates.some((shadowCandidate) => // any entry whose alias prefix normalizes to this provider
+    isExecutionAliasCandidateForProvider(shadowCandidate, candidateProvider),
+  );
+}
+
 export const __testing = {
   decodeDataUrl,
   coerceImageAssistantText,
@@ -148,6 +192,7 @@ export function resolveImageModelConfigForTool(params: {
       workspaceDir: params.workspaceDir,
       providerId: primary.provider,
       capability: "image",
+      includeConfiguredImageModels: !isMinimaxVlmProvider(primary.provider),
     });
     if (providerDefault) {
       return [`${primary.provider}/${providerDefault}`];
@@ -158,7 +203,7 @@ export function resolveImageModelConfigForTool(params: {
     return [];
   })();
 
-  const autoCandidates = imageToolProviderDeps
+  const rawAutoCandidates = imageToolProviderDeps
     .resolveAutoMediaKeyProviders({
       cfg: params.cfg,
       workspaceDir: params.workspaceDir,
@@ -170,15 +215,33 @@ export function resolveImageModelConfigForTool(params: {
         workspaceDir: params.workspaceDir,
         providerId,
         capability: "image",
+        includeConfiguredImageModels: !isMinimaxVlmProvider(providerId),
       });
       return modelId ? `${providerId}/${modelId}` : null;
     });
+  const autoCandidates = rawAutoCandidates.filter(
+    (candidate) =>
+      !isCanonicalCandidateShadowedByExecutionAlias(candidate, [
+        ...primaryCandidates,
+        ...rawAutoCandidates,
+      ]),
+  );
+  const defaultPrimaryIsImplicit = !hasExplicitDefaultPrimaryModel(params.cfg);
+  const primaryAliasCandidates = defaultPrimaryIsImplicit
+    ? autoCandidates.filter((candidate) =>
+        isExecutionAliasCandidateForProvider(candidate, primary.provider),
+      )
+    : [];
+  const remainingAutoCandidates =
+    primaryAliasCandidates.length === 0
+      ? autoCandidates
+      : autoCandidates.filter((candidate) => !primaryAliasCandidates.includes(candidate));
 
   return buildToolModelConfigFromCandidates({
     explicit,
     agentDir: params.agentDir,
     authStore: params.authStore,
-    candidates: [...primaryCandidates, ...autoCandidates],
+    candidates: [...primaryAliasCandidates, ...primaryCandidates, ...remainingAutoCandidates],
   });
 }
 
diff --git a/src/agents/tools/pdf-tool.model-config.test.ts b/src/agents/tools/pdf-tool.model-config.test.ts
index 125425f3e14..e9bdad13a49 100644
--- a/src/agents/tools/pdf-tool.model-config.test.ts
+++ b/src/agents/tools/pdf-tool.model-config.test.ts
@@ -28,6 +28,9 @@ vi.mock("./model-config.helpers.js", () => ({
     if (provider === "google") {
       return Boolean(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
     }
+    if (provider === "minimax" || provider === "minimax-cn") {
+      return Boolean(process.env.MINIMAX_API_KEY);
+    }
     return false;
   },
   resolveDefaultModelRef: (cfg?: OpenClawConfig) => {
@@ -105,4 +108,33 @@ describe("resolvePdfModelConfigForTool", () => {
       ANTHROPIC_PDF_MODEL,
     );
   });
+
+  it("does not add configured MiniMax chat models as automatic PDF image fallbacks", () => {
+    vi.stubEnv("MINIMAX_API_KEY", "minimax-test"); // the mocked hasAuthForProvider above reads this env var for minimax ids
+    const cfg = {
+      ...withDefaultModel("openai/gpt-5.4"),
+      models: {
+        providers: {
+          minimax: {
+            baseUrl: "https://api.minimax.io/anthropic",
+            models: [
+              {
+                id: "MiniMax-M2.7", // configured chat model that declares image input
+                name: "MiniMax M2.7",
+                reasoning: false,
+                input: ["text", "image"],
+                cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+                contextWindow: 128_000,
+                maxTokens: 8_192,
+              },
+            ],
+          },
+        },
+      },
+    } as OpenClawConfig;
+
+    expect(resolvePdfModelConfigForTool({ cfg, agentDir: TEST_AGENT_DIR })).toEqual({
+      primary: "minimax/MiniMax-VL-01", // the VL model is chosen; the configured M2.7 entry must not appear
+    });
+  });
 });
diff --git a/src/agents/tools/pdf-tool.model-config.ts b/src/agents/tools/pdf-tool.model-config.ts
index 8a32a1da401..da9dc09cd77 100644
--- a/src/agents/tools/pdf-tool.model-config.ts
+++ b/src/agents/tools/pdf-tool.model-config.ts
@@ -5,6 +5,7 @@ import {
   resolveDefaultMediaModel,
 } from "../../media-understanding/defaults.js";
 import type { AuthProfileStore } from "../auth-profiles/types.js";
+import { isMinimaxVlmProvider } from "../minimax-vlm.js";
 import {
   coerceImageModelConfig,
   type ImageModelConfig,
@@ -45,6 +46,7 @@ function resolveImageCandidateRefs(params: {
           workspaceDir: params.workspaceDir,
           providerId,
           capability: "image",
+          includeConfiguredImageModels: !isMinimaxVlmProvider(providerId),
         });
       return modelId ? `${providerId}/${modelId}` : null;
     })
@@ -106,6 +108,7 @@ export function resolvePdfModelConfigForTool(params: {
       workspaceDir: params.workspaceDir,
       providerId: primary.provider,
       capability: "image",
+      includeConfiguredImageModels: !isMinimaxVlmProvider(primary.provider),
     });
   const primarySupportsNativePdf = providerSupportsNativePdfDocument({
     cfg: params.cfg,
@@ -136,6 +139,7 @@ export function resolvePdfModelConfigForTool(params: {
       const providerId = providerKey.trim();
       if (
         !providerId ||
+        isMinimaxVlmProvider(providerId) ||
         !hasAuthForProvider({
           provider: providerId,
           agentDir: params.agentDir,
diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts
index 5ad614a279d..e5507b13c7e 100644
--- a/src/cli/capability-cli.test.ts
+++ b/src/cli/capability-cli.test.ts
@@ -1125,6 +1125,26 @@ describe("capability cli", () => {
     expect(outputs[0]?.kind).toBe("image.description");
   });
 
+  it("keeps image describe HTTP URLs as URLs", async () => { // no --model flag: asserts against imageDescribeCall()
+    await runRegisteredCli({
+      register: registerCapabilityCli as (program: Command) => void,
+      argv: [
+        "capability",
+        "image",
+        "describe",
+        "--file",
+        "https://httpbin.org/image/png", // remote input passed via --file
+        "--json",
+      ],
+    });
+
+    const describeCall = imageDescribeCall();
+    expect(describeCall?.filePath).toBe("https://httpbin.org/image/png"); // URL reaches the describe call unchanged
+    const output = firstJsonOutput();
+    const outputs = output?.outputs as Array<Record<string, unknown>>;
+    expect(outputs[0]?.path).toBe("https://httpbin.org/image/png"); // and is reported unchanged in the JSON output
+  });
+
   it("passes image describe prompts through media understanding", async () => {
     await runRegisteredCli({
       register: registerCapabilityCli as (program: Command) => void,
@@ -1221,6 +1241,28 @@ describe("capability cli", () => {
     expect(outputs[0]?.path).toBe("https://example.com/photo.png");
   });
 
+  it("keeps explicit-model image describe HTTP URLs as URLs", async () => { // --model variant: asserts against the with-model describe call
+    await runRegisteredCli({
+      register: registerCapabilityCli as (program: Command) => void,
+      argv: [
+        "capability",
+        "image",
+        "describe",
+        "--file",
+        "https://httpbin.org/image/png",
+        "--model",
+        "minimax-cn/MiniMax-VL-01", // explicit provider/model ref using the CN alias
+        "--json",
+      ],
+    });
+
+    const describeCall = firstImageDescribeWithModelCall();
+    expect(describeCall?.filePath).toBe("https://httpbin.org/image/png"); // URL is forwarded unchanged
+    expect(describeCall?.provider).toBe("minimax-cn"); // alias id is forwarded as given
+    expect(describeCall?.model).toBe("MiniMax-VL-01");
+    expect(mocks.describeImageFile).not.toHaveBeenCalled(); // the describeImageFile mock must stay unused
+  });
+
   it("passes describe-many prompts to each image", async () => {
     await runRegisteredCli({
       register: registerCapabilityCli as (program: Command) => void,
diff --git a/src/cli/capability-cli.ts b/src/cli/capability-cli.ts
index 8fa940437fb..ec2020ce8d3 100644
--- a/src/cli/capability-cli.ts
+++ b/src/cli/capability-cli.ts
@@ -1097,8 +1097,8 @@ async function runImageDescribe(params: {
   const prompt = normalizeOptionalString(params.prompt);
   const outputs = await Promise.all(
     params.files.map(async (filePath) => {
-      const isRemoteUrl = /^https?:\/\//i.test(filePath.trim());
-      const resolvedPath = isRemoteUrl ? filePath.trim() : path.resolve(filePath);
+      const resolvedPath = resolveImageDescribeInput(filePath);
+      const isRemoteUrl = /^https?:\/\//i.test(resolvedPath);
       const result = activeModel
         ? await describeImageFileWithModel({
             filePath: resolvedPath,
@@ -1513,6 +1513,11 @@ async function runTtsProviders(transport: CapabilityTransport) {
   };
 }
 
+function resolveImageDescribeInput(filePath: string): string { // http(s) URLs pass through trimmed; any other input becomes an absolute path.
+  const trimmed = filePath.trim();
+  return /^https?:\/\//i.test(trimmed) ? trimmed : path.resolve(filePath); // NOTE(review): the local branch resolves the untrimmed input
+}
+
 async function runTtsPersonas(transport: CapabilityTransport) {
   if (transport === "gateway") {
     return await callGateway({
diff --git a/src/media-understanding/attachments.cache.ts b/src/media-understanding/attachments.cache.ts
index 037795494f7..6f6ac570701 100644
--- a/src/media-understanding/attachments.cache.ts
+++ b/src/media-understanding/attachments.cache.ts
@@ -54,6 +54,14 @@ type AttachmentCacheEntry = {
 
 let defaultLocalPathRoots: readonly string[] | undefined;
 
+function concreteMime(mime: string | undefined): string | undefined { // Trimmed MIME type, or undefined when it is missing, blank, or a wildcard.
+  const normalized = mime?.trim();
+  if (!normalized || normalized.endsWith("/*")) { // e.g. "image/*" or "*/*" carry no concrete subtype
+    return undefined;
+  }
+  return normalized;
+}
+
 function getDefaultLocalPathRoots(): readonly string[] {
   defaultLocalPathRoots ??= mergeInboundPathRoots(getDefaultMediaLocalRoots());
   return defaultLocalPathRoots;
@@ -128,7 +136,7 @@ export class MediaAttachmentCache {
           entry.buffer = buffer;
           entry.bufferMime =
             entry.bufferMime ??
-            entry.attachment.mime ??
+            concreteMime(entry.attachment.mime) ??
             (await detectMime({
               buffer,
               filePath,
@@ -169,7 +177,7 @@ export class MediaAttachmentCache {
       });
       entry.buffer = fetched.buffer;
       entry.bufferMime =
-        entry.attachment.mime ??
+        concreteMime(entry.attachment.mime) ??
         fetched.contentType ??
         (await detectMime({
           buffer: fetched.buffer,
diff --git a/src/media-understanding/defaults.test.ts b/src/media-understanding/defaults.test.ts
index f329d8b0c0f..6dad50f5c78 100644
--- a/src/media-understanding/defaults.test.ts
+++ b/src/media-understanding/defaults.test.ts
@@ -140,6 +140,30 @@ describe("resolveDefaultMediaModel", () => {
       "kimi-k2.6",
     );
   });
+
+  it("prefers configured image models before manifest defaults", () => {
+    const cfg = {
+      models: {
+        providers: {
+          openrouter: {
+            models: [{ id: "google/gemini-2.5-flash", input: ["text", "image"] }], // configured model that declares image input
+          },
+        },
+      },
+    } as never;
+
+    expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "image", cfg })).toBe(
+      "google/gemini-2.5-flash", // default behaviour: the configured image model wins
+    );
+    expect(
+      resolveDefaultMediaModel({
+        providerId: "openrouter",
+        capability: "image",
+        cfg,
+        includeConfiguredImageModels: false, // opt out of configured models
+      }),
+    ).toBe("auto"); // value expected once configured models are excluded
+  });
 });
 
 describe("resolveAutoMediaKeyProviders", () => {
@@ -166,6 +190,36 @@ describe("resolveAutoMediaKeyProviders", () => {
     ]);
   });
 
+  it("preserves configured MiniMax CN aliases for image auto discovery", () => {
+    const providers = resolveAutoMediaKeyProviders({
+      capability: "image",
+      cfg: {
+        models: {
+          providers: {
+            "minimax-cn": {
+              models: [{ id: "MiniMax-M2.7", input: ["text", "image"] }],
+            },
+            "minimax-portal-cn": {
+              models: [{ id: "MiniMax-M2.7", input: ["text", "image"] }],
+            },
+            gemini: { // configured under the "gemini" key; expected to surface as "google"
+              models: [{ id: "gemini-3-flash-preview", input: ["text", "image"] }],
+            },
+          },
+        },
+      } as never,
+    });
+
+    expect(providers).toContain("minimax-cn"); // CN alias ids are kept as-is
+    expect(providers).toContain("minimax-portal-cn");
+    expect(providers).not.toContain("gemini"); // the "gemini" key itself is not returned
+    expect(providers).toContain("google");
+    expect(providers.indexOf("minimax-cn")).toBeLessThan(providers.indexOf("minimax")); // each alias is ordered ahead of its canonical id
+    expect(providers.indexOf("minimax-portal-cn")).toBeLessThan(
+      providers.indexOf("minimax-portal"),
+    );
+  });
+
   it("keeps the bundled video fallback order", () => {
     expect(resolveAutoMediaKeyProviders({ capability: "video" })).toEqual([
       "google",
diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts
index e101a3ffb55..e310ab3df3b 100644
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -2,7 +2,10 @@ import { resolveRuntimeConfigCacheKey } from "../config/runtime-snapshot.js";
 import type { OpenClawConfig } from "../config/types.js";
 import { normalizeOptionalString } from "../shared/string-coerce.js";
 import { buildMediaUnderstandingManifestMetadataRegistry } from "./manifest-metadata.js";
-import { normalizeMediaProviderId } from "./provider-registry.js";
+import {
+  normalizeMediaExecutionProviderId,
+  normalizeMediaProviderId,
+} from "./provider-registry.js";
 import { providerSupportsCapability } from "./provider-supports.js";
 import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
 export {
@@ -65,11 +68,11 @@ function resolveConfiguredImageProviderModel(params: {
   cfg?: OpenClawConfig;
   providerId: string;
 }): string | undefined {
+  const normalizedProviderId = normalizeMediaProviderId(params.providerId);
   const providers = params.cfg?.models?.providers;
   if (!providers || typeof providers !== "object") {
     return undefined;
   }
-  const normalizedProviderId = normalizeMediaProviderId(params.providerId);
   for (const [providerKey, providerCfg] of Object.entries(providers)) {
     if (normalizeMediaProviderId(providerKey) !== normalizedProviderId) {
       continue;
@@ -93,7 +96,7 @@ function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] {
   }
   const configured: string[] = [];
   for (const [providerKey, providerCfg] of Object.entries(providers)) {
-    const normalizedProviderId = normalizeMediaProviderId(providerKey);
+    const normalizedProviderId = normalizeMediaExecutionProviderId(providerKey);
     if (!normalizedProviderId || configured.includes(normalizedProviderId)) {
       continue;
     }
@@ -108,14 +111,39 @@ function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] {
   return configured;
 }
 
+function isExecutionAliasProvider(providerId: string): boolean { // True when normalizeMediaProviderId would change this id.
+  return normalizeMediaProviderId(providerId) !== providerId; // exact string comparison against the normalized form
+}
+
+function insertConfiguredImageProviders(params: { // Merges configured provider ids into the prioritized list and returns a deduplicated copy.
+  prioritized: string[];
+  configured: string[];
+}): string[] {
+  const merged = [...params.prioritized]; // copy so the caller's array is not mutated
+  for (const providerId of params.configured.filter(isExecutionAliasProvider)) { // pass 1: alias ids
+    const canonicalProviderId = normalizeMediaProviderId(providerId);
+    const canonicalIndex = merged.indexOf(canonicalProviderId);
+    if (canonicalIndex >= 0) {
+      merged.splice(canonicalIndex, 0, providerId); // place the alias immediately before its canonical id
+    } else {
+      merged.unshift(providerId); // canonical id absent: alias goes to the front
+    }
+  }
+  for (const providerId of params.configured.filter((id) => !isExecutionAliasProvider(id))) { // pass 2: non-alias ids
+    merged.push(providerId); // appended after everything already present
+  }
+  return [...new Set(merged)]; // dedupe, keeping the first occurrence of each id
+}
+
 export function resolveDefaultMediaModel(params: {
   providerId: string;
   capability: MediaUnderstandingCapability;
   cfg?: OpenClawConfig;
   workspaceDir?: string;
   providerRegistry?: Map<string, MediaUnderstandingProvider>;
+  includeConfiguredImageModels?: boolean;
 }): string | undefined {
-  if (!params.providerRegistry) {
+  if (!params.providerRegistry && params.includeConfiguredImageModels !== false) {
     const configuredImageModel =
       params.capability === "image"
         ? resolveConfiguredImageProviderModel({
@@ -130,7 +158,13 @@ export function resolveDefaultMediaModel(params: {
   const registry =
     params.providerRegistry ?? resolveDefaultRegistry(params.cfg, params.workspaceDir);
   const provider = registry.get(normalizeMediaProviderId(params.providerId));
-  return normalizeOptionalString(provider?.defaultModels?.[params.capability]);
+  const manifestDefaultModel = normalizeOptionalString(
+    provider?.defaultModels?.[params.capability],
+  );
+  if (manifestDefaultModel) {
+    return manifestDefaultModel;
+  }
+  return undefined;
 }
 
 export function resolveAutoMediaKeyProviders(params: {
@@ -165,7 +199,10 @@ export function resolveAutoMediaKeyProviders(params: {
   if (params.providerRegistry || params.capability !== "image") {
     return prioritized;
   }
-  return [...new Set([...prioritized, ...resolveConfiguredImageProviderIds(params.cfg)])];
+  return insertConfiguredImageProviders({
+    prioritized,
+    configured: resolveConfiguredImageProviderIds(params.cfg),
+  });
 }
 
 export function providerSupportsNativePdfDocument(params: {
diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts
index 943ae2291ac..1ed605222e3 100644
--- a/src/media-understanding/image.test.ts
+++ b/src/media-understanding/image.test.ts
@@ -335,6 +335,135 @@ describe("describeImageWithModel", () => {
     expect(fetchMock).toHaveBeenCalledOnce();
   });
 
+  it("uses canonical MiniMax CN baseUrl for VLM alias fallback", async () => {
+    const authStorage = {
+      setRuntimeApiKey: setRuntimeApiKeyMock,
+    };
+    resolveModelAsyncMock.mockResolvedValue({ // model resolution reports the alias model as unknown
+      authStorage,
+      modelRegistry: { find: vi.fn(() => null) },
+      error: "Unknown model: minimax-cn/MiniMax-VL-01",
+    });
+
+    await expect(
+      describeImageWithModel({
+        cfg: {
+          models: {
+            providers: {
+              minimax: { // only the canonical entry is configured; the alias has none
+                apiKey: "minimax-test-key",
+                baseUrl: "https://api.minimaxi.com/anthropic",
+                models: [],
+              },
+            },
+          },
+        },
+        agentDir: "/tmp/openclaw-agent",
+        provider: "minimax-cn", // request is made through the CN alias
+        model: "MiniMax-VL-01",
+        buffer: Buffer.from("png-bytes"),
+        fileName: "image.png",
+        mime: "image/png",
+        prompt: "Describe the image.",
+        timeoutMs: 1000,
+      }),
+    ).resolves.toEqual({
+      text: "portal ok",
+      model: "MiniMax-VL-01",
+    });
+
+    expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "minimax", // auth lookup uses the canonical id
+      }),
+    );
+    const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
+    expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm"); // request goes to the host from the canonical entry's baseUrl
+  });
+
+  it("uses MiniMax CN alias auth when the alias apiKey is a SecretRef", async () => {
+    const authStorage = {
+      setRuntimeApiKey: setRuntimeApiKeyMock,
+    };
+    resolveModelAsyncMock.mockResolvedValue({ // model resolution reports the alias model as unknown
+      authStorage,
+      modelRegistry: { find: vi.fn(() => null) },
+      error: "Unknown model: minimax-cn/MiniMax-VL-01",
+    });
+
+    await expect(
+      describeImageWithModel({
+        cfg: {
+          models: {
+            providers: {
+              "minimax-cn": { // the alias entry itself is configured
+                apiKey: { source: "file", provider: "default", id: "/providers/minimax-cn/apiKey" }, // object-form secret reference, not a literal string
+                baseUrl: "https://api.minimaxi.com/anthropic",
+                models: [],
+              },
+            },
+          },
+        },
+        agentDir: "/tmp/openclaw-agent",
+        provider: "minimax-cn",
+        model: "MiniMax-VL-01",
+        buffer: Buffer.from("png-bytes"),
+        fileName: "image.png",
+        mime: "image/png",
+        prompt: "Describe the image.",
+        timeoutMs: 1000,
+      }),
+    ).resolves.toEqual({
+      text: "portal ok",
+      model: "MiniMax-VL-01",
+    });
+
+    expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "minimax-cn", // auth lookup stays on the alias id
+      }),
+    );
+    const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
+    expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
+  });
+
+  it("does not inherit global MiniMax baseUrl for CN VLM aliases", async () => {
+    const authStorage = {
+      setRuntimeApiKey: setRuntimeApiKeyMock,
+    };
+    resolveModelAsyncMock.mockResolvedValue({ // model resolution reports the alias model as unknown
+      authStorage,
+      modelRegistry: { find: vi.fn(() => null) },
+      error: "Unknown model: minimax-cn/MiniMax-VL-01",
+    });
+
+    await expect(
+      describeImageWithModel({
+        cfg: {
+          models: {
+            providers: {
+              minimax: { baseUrl: "https://api.minimax.io/anthropic", models: [] }, // canonical entry points at the api.minimax.io host
+            },
+          },
+        },
+        agentDir: "/tmp/openclaw-agent",
+        provider: "minimax-cn", // request is made through the CN alias
+        model: "MiniMax-VL-01",
+        buffer: Buffer.from("png-bytes"),
+        fileName: "image.png",
+        mime: "image/png",
+        prompt: "Describe the image.",
+        timeoutMs: 1000,
+      }),
+    ).resolves.toEqual({
+      text: "portal ok",
+      model: "MiniMax-VL-01",
+    });
+
+    const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
+    expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm"); // api.minimaxi.com is used, not the configured api.minimax.io
+  });
+
   it("carries workspaceDir through image model and stream resolution", async () => {
     discoverModelsMock.mockReturnValue({
       find: vi.fn(() => ({
diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts
index a29439fa663..bada17737ea 100644
--- a/src/media-understanding/image.ts
+++ b/src/media-understanding/image.ts
@@ -21,11 +21,13 @@ import {
   coerceImageAssistantText,
   hasImageReasoningOnlyResponse,
 } from "../agents/tools/image-tool.helpers.js";
+import { isSecretRef } from "../config/types.secrets.js";
 import {
   buildCopilotIdeHeaders,
   COPILOT_INTEGRATION_ID,
   resolveCopilotApiToken,
 } from "../plugin-sdk/provider-auth.js";
+import { normalizeMediaProviderId } from "./provider-id.js";
 import type {
   ImageDescriptionRequest,
   ImageDescriptionResult,
@@ -315,6 +317,7 @@ function buildImageRequestHeaders(model: Model<Api>): Record<string, string> | u
 
 async function describeImagesWithMinimax(params: {
   apiKey: string;
+  provider: string;
   modelId: string;
   modelBaseUrl?: string;
   prompt: string;
@@ -329,6 +332,7 @@ async function describeImagesWithMinimax(params: {
         : params.prompt;
     const text = await minimaxUnderstandImage({
       apiKey: params.apiKey,
+      provider: params.provider,
       prompt,
       imageDataUrl: `data:${image.mime ?? "image/jpeg"};base64,${image.buffer.toString("base64")}`,
       modelBaseUrl: params.modelBaseUrl,
@@ -354,9 +358,53 @@ function resolveConfiguredProviderBaseUrl(
   if (typeof direct?.baseUrl === "string" && direct.baseUrl.trim()) {
     return direct.baseUrl.trim();
   }
+  const normalizedProvider = normalizeMediaProviderId(provider);
+  const normalized = cfg.models?.providers?.[normalizedProvider];
+  if (typeof normalized?.baseUrl === "string" && normalized.baseUrl.trim()) {
+    if (isMinimaxCnAlias(provider) && !isMinimaxCnBaseUrl(normalized.baseUrl)) {
+      return undefined;
+    }
+    return normalized.baseUrl.trim();
+  }
   return undefined;
 }
 
+function isMinimaxCnAlias(provider: string): boolean { // True for the two CN alias ids, ignoring case and surrounding whitespace.
+  const normalized = provider.trim().toLowerCase();
+  return normalized === "minimax-cn" || normalized === "minimax-portal-cn";
+}
+
+function isMinimaxCnBaseUrl(baseUrl: string): boolean { // True when the URL's hostname is api.minimaxi.com.
+  const trimmed = baseUrl.trim();
+  if (!trimmed) { // blank input never matches
+    return false;
+  }
+  try {
+    const parsed = new URL(/^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`); // prepend https:// when no http(s) scheme is present
+    return parsed.hostname.toLowerCase() === "api.minimaxi.com"; // exact hostname match; path and port are ignored
+  } catch { // unparseable input is treated as "not CN" rather than an error
+    return false;
+  }
+}
+
+function hasConfiguredProviderApiKey( // True when cfg.models.providers[provider] carries a non-blank string apiKey or a SecretRef.
+  cfg: ImageDescriptionRequest["cfg"],
+  provider: string,
+): boolean {
+  const apiKey = cfg.models?.providers?.[provider]?.apiKey; // direct key lookup; the provider id is not normalized here
+  return (typeof apiKey === "string" && apiKey.trim().length > 0) || isSecretRef(apiKey);
+}
+
+function resolveMinimaxVlmAuthProvider( // Picks the provider id to resolve credentials for.
+  cfg: ImageDescriptionRequest["cfg"],
+  provider: string,
+): string {
+  if (!isMinimaxCnAlias(provider) || hasConfiguredProviderApiKey(cfg, provider)) { // non-alias ids, or aliases with their own apiKey, are used as given
+    return provider;
+  }
+  return normalizeMediaProviderId(provider); // CN alias without its own apiKey: use the normalized provider id
+}
+
 async function resolveMinimaxVlmFallbackRuntime(params: {
   cfg: ImageDescriptionRequest["cfg"];
   agentDir: string;
@@ -365,8 +413,9 @@ async function resolveMinimaxVlmFallbackRuntime(params: {
   profile?: string;
   preferredProfile?: string;
 }): Promise<{ apiKey: string; modelBaseUrl?: string }> {
+  const authProvider = resolveMinimaxVlmAuthProvider(params.cfg, params.provider);
   const auth = await resolveApiKeyForProvider({
-    provider: params.provider,
+    provider: authProvider,
     cfg: params.cfg,
     profileId: params.profile,
     preferredProfile: params.preferredProfile,
@@ -374,7 +423,7 @@ async function resolveMinimaxVlmFallbackRuntime(params: {
     ...(params.workspaceDir ? { workspaceDir: params.workspaceDir } : {}),
   });
   return {
-    apiKey: requireApiKey(auth, params.provider),
+    apiKey: requireApiKey(auth, authProvider),
     modelBaseUrl: resolveConfiguredProviderBaseUrl(params.cfg, params.provider),
   };
 }
@@ -437,6 +486,7 @@ async function describeImagesWithModelInternal(
     const fallback = await resolveMinimaxVlmFallbackRuntime(params);
     return await describeImagesWithMinimax({
       apiKey: fallback.apiKey,
+      provider: params.provider,
       modelId: params.model,
       modelBaseUrl: fallback.modelBaseUrl,
       prompt,
@@ -448,6 +498,7 @@ async function describeImagesWithModelInternal(
   if (isMinimaxVlmModel(model.provider, model.id)) {
     return await describeImagesWithMinimax({
       apiKey,
+      provider: model.provider,
       modelId: model.id,
       modelBaseUrl: model.baseUrl,
       prompt,
diff --git a/src/media-understanding/media-understanding-misc.test.ts b/src/media-understanding/media-understanding-misc.test.ts
index d1ac691576a..1cd9111bd27 100644
--- a/src/media-understanding/media-understanding-misc.test.ts
+++ b/src/media-understanding/media-understanding-misc.test.ts
@@ -107,6 +107,28 @@ describe("media understanding attachments SSRF", () => {
     expect(fetchSpy).toHaveBeenCalledTimes(1);
   });
 
+  it("uses fetched content type instead of wildcard selection hints", async () => {
+    const url = "http://198.18.0.153/image"; // address inside 198.18.0.0/15; allowed via the ssrfPolicy option below
+    const fetchSpy = vi.fn().mockResolvedValue(
+      new Response("image", {
+        headers: { "content-type": "image/png" }, // concrete type reported by the mocked response
+      }),
+    );
+    globalThis.fetch = withFetchPreconnect(fetchSpy);
+    const cache = new MediaAttachmentCache([{ index: 0, url, mime: "image/*" }], { // attachment carries only a wildcard MIME hint
+      ssrfPolicy: { allowRfc2544BenchmarkRange: true },
+    });
+
+    const result = await cache.getBuffer({
+      attachmentIndex: 0,
+      maxBytes: 1024,
+      timeoutMs: 1000,
+    });
+
+    expect(result.mime).toBe("image/png"); // response content type wins over the "image/*" hint
+    expect(result.fileName).toBe("image.png"); // file name is expected to carry the .png extension
+  });
+
   it("reads local attachments inside configured roots", async () => {
     await withLocalAttachmentCache("openclaw-media-cache-allowed-", async ({ cache }) => {
       const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 });
diff --git a/src/media-understanding/provider-id.ts b/src/media-understanding/provider-id.ts
index 777fbeab7ba..c48152f9bf6 100644
--- a/src/media-understanding/provider-id.ts
+++ b/src/media-understanding/provider-id.ts
@@ -5,5 +5,19 @@ export function normalizeMediaProviderId(id: string): string {
   if (normalized === "gemini") {
     return "google";
   }
+  if (normalized === "minimax-cn") {
+    return "minimax";
+  }
+  if (normalized === "minimax-portal-cn") {
+    return "minimax-portal";
+  }
   return normalized;
 }
+
+export function normalizeMediaExecutionProviderId(id: string): string { // Like normalizeMediaProviderId, but keeps the two MiniMax CN alias ids.
+  const normalized = normalizeProviderId(id);
+  if (normalized === "minimax-cn" || normalized === "minimax-portal-cn") { // these aliases are returned unchanged instead of being folded into minimax / minimax-portal
+    return normalized;
+  }
+  return normalizeMediaProviderId(normalized); // every other id gets the regular media normalization
+}
diff --git a/src/media-understanding/provider-registry.ts b/src/media-understanding/provider-registry.ts
index ac1688fc9a7..9c3c6d9e724 100644
--- a/src/media-understanding/provider-registry.ts
+++ b/src/media-understanding/provider-registry.ts
@@ -41,7 +41,7 @@ function hydrateModelBackedMediaProvider(
   };
 }
 
-export { normalizeMediaProviderId } from "./provider-id.js";
+export { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.js";
 
 export function buildMediaUnderstandingRegistry(
   overrides?: Record<string, MediaUnderstandingProvider>,
diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts
index eba690b226e..76b6cbfd27d 100644
--- a/src/media-understanding/runner.entries.ts
+++ b/src/media-understanding/runner.entries.ts
@@ -34,6 +34,7 @@ import { MediaUnderstandingSkipError } from "./errors.js";
 import { fileExists } from "./fs.js";
 import { describeImageWithModel } from "./image-runtime.js";
 import { extractGeminiResponse } from "./output-extract.js";
+import { normalizeMediaExecutionProviderId } from "./provider-id.js";
 import { getMediaUnderstandingProvider, normalizeMediaProviderId } from "./provider-registry.js";
 import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js";
 import type {
@@ -566,6 +567,7 @@ export async function runProviderEntry(params: {
     throw new Error(`Provider entry missing provider for ${capability}`);
   }
   const providerId = normalizeMediaProviderId(providerIdRaw);
+  const requestProviderId = normalizeMediaExecutionProviderId(providerIdRaw);
   const { maxBytes, maxChars, timeoutMs, prompt } = resolveEntryRunOptions({
     capability,
     entry,
@@ -587,13 +589,13 @@ export async function runProviderEntry(params: {
       timeoutMs,
     });
     const requestOverrides = resolveMediaRequestOverrides(params.config);
-    const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
+    const provider = getMediaUnderstandingProvider(requestProviderId, params.providerRegistry);
     const imageInput = {
       buffer: media.buffer,
       fileName: media.fileName,
       mime: media.mime,
       model: modelId,
-      provider: providerId,
+      provider: requestProviderId,
       prompt: requestOverrides.prompt ?? prompt,
       timeoutMs,
       profile: entry.profile,
@@ -608,7 +610,7 @@ export async function runProviderEntry(params: {
       kind: "image.description",
       attachmentIndex: params.attachmentIndex,
       text: trimOutput(result.text, maxChars),
-      provider: providerId,
+      provider: requestProviderId,
       model: result.model ?? modelId,
     };
   }
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index 61198867d37..a9ceb41682d 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -2,6 +2,7 @@ import { constants as fsConstants } from "node:fs";
 import fs from "node:fs/promises";
 import os from "node:os";
 import path from "node:path";
+import { isMinimaxVlmModel, isMinimaxVlmProvider } from "../agents/minimax-vlm.js";
 import { findNormalizedProviderValue } from "../agents/provider-id.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import {
@@ -26,7 +27,7 @@ import { MediaAttachmentCache, selectAttachments } from "./attachments.js";
 import { isMediaUnderstandingSkipError } from "./errors.js";
 import { fileExists } from "./fs.js";
 import { extractGeminiResponse } from "./output-extract.js";
-import { normalizeMediaProviderId } from "./provider-id.js";
+import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.js";
 import {
   buildMediaUnderstandingRegistry,
   getMediaUnderstandingProvider,
@@ -73,7 +74,7 @@ function resolveLiteralProviderApiKey(
   cfg: OpenClawConfig | undefined,
   providerId: string,
 ): string | null {
-  const value = cfg?.models?.providers?.[providerId]?.apiKey;
+  const value = findNormalizedProviderValue(cfg?.models?.providers, providerId)?.apiKey;
   return typeof value === "string" && value.trim().length > 0 ? value.trim() : null;
 }
 
@@ -98,11 +99,14 @@ function resolveConfiguredKeyProviderOrder(params: {
   fallbackProviders: readonly string[];
 }): string[] {
   const configuredProviders = Object.keys(params.cfg.models?.providers ?? {})
-    .map((providerId) => normalizeMediaProviderId(providerId))
+    .map((providerId) => normalizeMediaExecutionProviderId(providerId))
     .filter(Boolean)
     .filter((providerId, index, values) => values.indexOf(providerId) === index)
     .filter((providerId) =>
-      providerSupportsCapability(params.providerRegistry.get(providerId), params.capability),
+      providerSupportsCapability(
+        params.providerRegistry.get(normalizeMediaProviderId(providerId)),
+        params.capability,
+      ),
     );
 
   return [...new Set([...configuredProviders, ...params.fallbackProviders])];
@@ -112,6 +116,9 @@ function resolveConfiguredImageModelId(params: {
   cfg: OpenClawConfig;
   providerId: string;
 }): string | undefined {
+  if (isMinimaxVlmProvider(params.providerId)) {
+    return undefined;
+  }
   const configured = resolveConfiguredImageModel(params);
   const id = configured?.id?.trim();
   return id || undefined;
@@ -145,7 +152,7 @@ function resolveCatalogImageModelId(params: {
 }): string | undefined {
   const matches = params.catalog.filter(
     (entry) =>
-      normalizeMediaProviderId(entry.provider) === params.providerId &&
+      normalizeMediaProviderId(entry.provider) === normalizeMediaProviderId(params.providerId) &&
       params.modelSupportsVision(entry),
   );
   if (matches.length === 0) {
@@ -200,6 +207,12 @@ async function explicitImageModelVisionStatus(params: {
   providerId: string;
   model: string;
 }): Promise<"supported" | "unsupported" | "unknown"> {
+  if (
+    isMinimaxVlmProvider(params.providerId) &&
+    !isMinimaxVlmModel(params.providerId, params.model)
+  ) {
+    return "unsupported";
+  }
   const configured = resolveConfiguredImageModel(params);
   if (configured?.id?.trim() === params.model && configured.input?.includes("image")) {
     return "supported";
@@ -231,6 +244,9 @@ async function resolveAutoImageModelId(params: {
       return explicit;
     }
   }
+  if (isMinimaxVlmProvider(params.providerId)) {
+    return "MiniMax-VL-01";
+  }
   const configuredModel = resolveConfiguredImageModelId(params);
   if (configuredModel) {
     return configuredModel;
@@ -736,7 +752,7 @@ async function resolveActiveModelEntry(params: {
   if (!activeProviderRaw) {
     return null;
   }
-  const providerId = normalizeMediaProviderId(activeProviderRaw);
+  const providerId = normalizeMediaExecutionProviderId(activeProviderRaw);
   if (!providerId) {
     return null;
   }
@@ -940,6 +956,7 @@ export async function runCapability(params: {
   if (
     capability === "image" &&
     activeProvider &&
+    !isMinimaxVlmProvider(activeProvider) &&
     !hasExplicitImageUnderstandingConfig({ cfg, config })
   ) {
     const { findModelInCatalog, loadModelCatalog, modelSupportsVision } =
diff --git a/src/media-understanding/runner.vision-skip.test.ts b/src/media-understanding/runner.vision-skip.test.ts
index 305852fe737..83eb7405ed6 100644
--- a/src/media-understanding/runner.vision-skip.test.ts
+++ b/src/media-understanding/runner.vision-skip.test.ts
@@ -12,6 +12,7 @@ import { createEmptyPluginRegistry } from "../plugins/registry.js";
 import { setActivePluginRegistry } from "../plugins/runtime.js";
 import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js";
 import { withMediaFixture } from "./runner.test-utils.js";
+import type { MediaUnderstandingProvider } from "./types.js";
 
 type TestCatalogEntry = {
   id: string;
@@ -273,7 +274,7 @@ describe("runCapability image skip", () => {
           imageModel: { primary: "openrouter/google/gemini-2.5-flash" },
         },
       },
-    } as OpenClawConfig;
+    } as unknown as OpenClawConfig;
 
     await expect(
       resolveAutoImageModel({
@@ -286,13 +287,13 @@ describe("runCapability image skip", () => {
     });
   });
 
-  it("falls back from an active text model to the provider image default", async () => {
+  it("falls back from a MiniMax chat model to the provider image default", async () => {
     catalog = [
       {
         id: "MiniMax-M2.7",
         name: "MiniMax M2.7",
         provider: "minimax-portal",
-        input: ["text"] as const,
+        input: ["text", "image"] as const,
       },
       {
         id: "MiniMax-VL-01",
@@ -302,7 +303,20 @@ describe("runCapability image skip", () => {
       },
     ];
     vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key");
-    const cfg = {} as OpenClawConfig;
+    const cfg = {
+      models: {
+        providers: {
+          "minimax-portal": {
+            models: [
+              {
+                id: "MiniMax-M2.7",
+                input: ["text", "image"],
+              },
+            ],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
     const pluginRegistry = createEmptyPluginRegistry();
     pluginRegistry.mediaUnderstandingProviders.push({
       pluginId: "minimax",
@@ -333,6 +347,300 @@ describe("runCapability image skip", () => {
     }
   });
 
+  it("does not native-skip MiniMax chat models that claim image input", async () => {
+    catalog = [
+      {
+        id: "MiniMax-M2.7",
+        name: "MiniMax M2.7",
+        provider: "minimax-portal",
+        input: ["text", "image"] as const, // the only catalog entry claims image input
+      },
+    ];
+    vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key");
+    const cfg = {
+      models: {
+        providers: {
+          "minimax-portal": {
+            models: [
+              {
+                id: "MiniMax-M2.7",
+                input: ["text", "image"],
+              },
+            ],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
+    const pluginRegistry = createEmptyPluginRegistry();
+    pluginRegistry.mediaUnderstandingProviders.push({
+      pluginId: "minimax",
+      pluginName: "MiniMax Provider",
+      source: "test",
+      provider: {
+        id: "minimax-portal",
+        capabilities: ["image"],
+        defaultModels: { image: "MiniMax-VL-01" }, // provider-level image default
+        describeImage: async (req) => ({ text: "vlm ok", model: req.model }), // echoes the model
+      },
+    });
+    setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
+
+    try {
+      await withMediaFixture(
+        {
+          filePrefix: "openclaw-minimax-vlm-no-native-skip",
+          extension: "png",
+          mediaType: "image/png",
+          fileContents: Buffer.from("image"),
+        },
+        async ({ ctx, media, cache }) => {
+          const result = await runCapability({
+            capability: "image",
+            cfg,
+            ctx,
+            attachments: cache,
+            media,
+            agentDir: "/tmp",
+            providerRegistry: buildProviderRegistry(undefined, cfg),
+            activeModel: { provider: "minimax-portal", model: "MiniMax-M2.7" },
+          });
+
+          expect(result.decision.outcome).toBe("success"); // described rather than skipped
+          expect(requireCapabilityOutput(result, 0)).toEqual({
+            kind: "image.description",
+            attachmentIndex: 0,
+            provider: "minimax-portal",
+            model: "MiniMax-VL-01", // the image default, not the active MiniMax-M2.7
+            text: "vlm ok",
+          });
+        },
+      );
+    } finally {
+      setActivePluginRegistry(createEmptyPluginRegistry()); // restore an empty plugin registry
+      vi.unstubAllEnvs();
+    }
+  });
+
+  it("preserves MiniMax CN aliases from configured provider routing", async () => {
+    const seenProviders: string[] = []; // provider ids observed by describeImage
+    const cfg = {
+      models: {
+        providers: {
+          "minimax-cn": {
+            apiKey: "test-minimax-key",
+            baseUrl: "https://api.minimaxi.com/anthropic",
+            models: [],
+          },
+        },
+      },
+    } as OpenClawConfig;
+    const pluginRegistry = createEmptyPluginRegistry();
+    pluginRegistry.mediaUnderstandingProviders.push({
+      pluginId: "minimax",
+      pluginName: "MiniMax Provider",
+      source: "test",
+      provider: {
+        id: "minimax", // registered under "minimax"; the config key above is "minimax-cn"
+        capabilities: ["image"],
+        defaultModels: { image: "MiniMax-VL-01" },
+        describeImage: async (req) => {
+          seenProviders.push(req.provider);
+          return { text: "cn vlm ok", model: req.model };
+        },
+      },
+    });
+    setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
+
+    try {
+      await withMediaFixture(
+        {
+          filePrefix: "openclaw-minimax-cn-provider",
+          extension: "png",
+          mediaType: "image/png",
+          fileContents: Buffer.from("image"),
+        },
+        async ({ ctx, media, cache }) => {
+          const result = await runCapability({
+            capability: "image",
+            cfg,
+            ctx,
+            attachments: cache,
+            media,
+            agentDir: "/tmp",
+            providerRegistry: buildProviderRegistry(undefined, cfg),
+          });
+
+          expect(result.decision.outcome).toBe("success");
+          expect(seenProviders).toEqual(["minimax-cn"]); // the alias reached describeImage
+          expect(requireCapabilityOutput(result, 0)).toEqual({
+            kind: "image.description",
+            attachmentIndex: 0,
+            provider: "minimax-cn", // output reports the alias as well
+            model: "MiniMax-VL-01",
+            text: "cn vlm ok",
+          });
+        },
+      );
+    } finally {
+      setActivePluginRegistry(createEmptyPluginRegistry()); // restore an empty plugin registry
+      vi.unstubAllEnvs();
+    }
+  });
+
+  it("keeps MiniMax auto routing on VLM when registry lacks a default model", async () => {
+    let seenModel: string | undefined; // model id received by describeImage
+    await withMediaFixture(
+      {
+        filePrefix: "openclaw-minimax-vlm-default",
+        extension: "png",
+        mediaType: "image/png",
+        fileContents: Buffer.from("image"),
+      },
+      async ({ ctx, media, cache }) => {
+        const cfg = {
+          models: {
+            providers: {
+              minimax: {
+                apiKey: "test-minimax-key",
+                baseUrl: "https://api.minimax.io/anthropic",
+                models: [
+                  {
+                    id: "MiniMax-M2.5",
+                    name: "MiniMax M2.5",
+                    reasoning: false,
+                    input: ["text", "image"], // configured chat model claims image input
+                    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+                    contextWindow: 128_000,
+                    maxTokens: 8_192,
+                  },
+                ],
+              },
+            },
+          },
+        } as OpenClawConfig;
+
+        const result = await runCapability({
+          capability: "image",
+          cfg,
+          ctx,
+          attachments: cache,
+          media,
+          agentDir: "/tmp",
+          providerRegistry: new Map([
+            [
+              "minimax",
+              {
+                id: "minimax",
+                capabilities: ["image"], // note: this entry declares no defaultModels
+                describeImage: async (req) => {
+                  seenModel = req.model;
+                  return { text: "vlm ok", model: req.model };
+                },
+              },
+            ],
+          ]),
+        });
+
+        expect(result.decision.outcome).toBe("success");
+        expect(seenModel).toBe("MiniMax-VL-01"); // not the configured MiniMax-M2.5
+        expect(requireCapabilityOutput(result, 0)).toMatchObject({
+          provider: "minimax",
+          model: "MiniMax-VL-01",
+          text: "vlm ok",
+        });
+      },
+    );
+  });
+
+  it("keeps non-MiniMax media aliases canonical for image execution", async () => {
+    const seenProviders: string[] = []; // provider ids observed by describeImage
+    const cfg = {
+      tools: {
+        media: {
+          image: {
+            models: [{ provider: "gemini", model: "gemini-3-flash-preview" }],
+          },
+        },
+      },
+    } as OpenClawConfig;
+    const providerRegistry = new Map<string, MediaUnderstandingProvider>([
+      [
+        "google", // registry key is "google"; the config entry above names "gemini"
+        {
+          id: "google",
+          capabilities: ["image" as const],
+          describeImage: async (req) => {
+            seenProviders.push(req.provider);
+            return { text: "google ok", model: req.model };
+          },
+        },
+      ],
+    ]);
+
+    await withMediaFixture(
+      {
+        filePrefix: "openclaw-gemini-media-alias",
+        extension: "png",
+        mediaType: "image/png",
+        fileContents: Buffer.from("image"),
+      },
+      async ({ ctx, media, cache }) => {
+        const result = await runCapability({
+          capability: "image",
+          cfg,
+          ctx,
+          attachments: cache,
+          media,
+          agentDir: "/tmp",
+          providerRegistry,
+        });
+
+        expect(result.decision.outcome).toBe("success");
+        expect(seenProviders).toEqual(["google"]); // describeImage saw "google", not "gemini"
+        expect(requireCapabilityOutput(result, 0)).toEqual({
+          kind: "image.description",
+          attachmentIndex: 0,
+          provider: "google",
+          model: "gemini-3-flash-preview",
+          text: "google ok",
+        });
+      },
+    );
+  });
+
+  it("canonicalizes non-MiniMax active media aliases for auto image resolution", async () => {
+    vi.stubEnv("GEMINI_API_KEY", "test-gemini-key");
+    const cfg = {} as OpenClawConfig; // empty config: nothing image-related is configured
+    const pluginRegistry = createEmptyPluginRegistry();
+    pluginRegistry.mediaUnderstandingProviders.push({
+      pluginId: "google",
+      pluginName: "Google Provider",
+      source: "test",
+      provider: {
+        id: "google",
+        capabilities: ["image"],
+        defaultModels: { image: "gemini-3-flash-preview" },
+        describeImage: async () => ({ text: "ok" }),
+      },
+    });
+    setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
+
+    try {
+      await expect(
+        resolveAutoImageModel({
+          cfg,
+          activeModel: { provider: "gemini", model: "gemini-3-flash-preview" },
+        }),
+      ).resolves.toEqual({
+        provider: "google", // resolved id is "google" although the active provider is "gemini"
+        model: "gemini-3-flash-preview",
+      });
+    } finally {
+      setActivePluginRegistry(createEmptyPluginRegistry()); // restore an empty plugin registry
+      vi.unstubAllEnvs();
+    }
+  });
+
   it("uses active OpenRouter image models for auto image resolution", async () => {
     vi.stubEnv("OPENROUTER_API_KEY", "test-openrouter-key");
     const cfg = {} as OpenClawConfig;
diff --git a/src/media-understanding/runtime.test.ts b/src/media-understanding/runtime.test.ts
index 8e299f4cce5..3b5347b3e2d 100644
--- a/src/media-understanding/runtime.test.ts
+++ b/src/media-understanding/runtime.test.ts
@@ -67,6 +67,10 @@ describe("media-understanding runtime", () => {
   afterEach(() => {
     mocks.buildProviderRegistry.mockReset();
     mocks.createMediaAttachmentCache.mockReset();
+    mocks.createMediaAttachmentCache.mockReturnValue({
+      cleanup: mocks.cleanup,
+      getBuffer: mocks.getBuffer,
+    });
     mocks.normalizeMediaAttachments.mockReset();
     mocks.normalizeMediaProviderId.mockReset();
     mocks.buildMediaUnderstandingRegistry.mockReset();
@@ -186,6 +190,76 @@ describe("media-understanding runtime", () => {
     expect(mocks.cleanup).toHaveBeenCalledTimes(1);
   });
 
+  it("classifies extensionless remote image URLs before capability filtering", async () => {
+    const output: MediaUnderstandingOutput = {
+      kind: "image.description",
+      attachmentIndex: 0,
+      provider: "vision-plugin",
+      model: "vision-v1",
+      text: "image ok",
+    };
+    mocks.normalizeMediaAttachments.mockReturnValue([
+      { index: 0, url: "https://httpbin.org/image/png", mime: "image/*" },
+    ]);
+    mocks.runCapability.mockResolvedValue({
+      outputs: [output],
+    });
+
+    await expect(
+      describeImageFile({
+        filePath: "https://httpbin.org/image/png", // http(s) URL whose path has no extension
+        cfg: {} as OpenClawConfig,
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toEqual({
+      text: "image ok",
+      provider: "vision-plugin",
+      model: "vision-v1",
+      output,
+    });
+
+    expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
+      MediaUrl: "https://httpbin.org/image/png", // passed as MediaUrl; no MediaPath key expected
+      MediaType: "image/*", // wildcard type, not a concrete MIME
+    });
+    expect(requireRunCapabilityRequest()).toMatchObject({
+      ctx: {
+        MediaUrl: "https://httpbin.org/image/png",
+        MediaType: "image/*",
+      },
+    });
+  });
+
+  it("does not force typed remote URLs into the requested capability", async () => {
+    const media = [{ index: 0, url: "https://example.com/clip.mp4", mime: "video/mp4" }];
+    mocks.normalizeMediaAttachments.mockReturnValue(media);
+    mocks.runCapability.mockResolvedValue({
+      outputs: [], // mocked run yields no outputs and a "skipped" decision
+      decision: { capability: "image", outcome: "skipped", attachments: [] },
+    });
+
+    await expect(
+      describeImageFile({
+        filePath: "https://example.com/clip.mp4", // URL with a video file extension
+        cfg: {} as OpenClawConfig,
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toMatchObject({
+      text: undefined,
+      output: undefined,
+    });
+
+    expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
+      MediaUrl: "https://example.com/clip.mp4",
+      MediaType: "video/mp4", // stays video although the request below is for "image"
+    });
+    expect(requireRunCapabilityRequest()).toMatchObject({
+      capability: "image",
+      ctx: { MediaUrl: "https://example.com/clip.mp4", MediaType: "video/mp4" },
+      media,
+    });
+  });
+
   it("passes workspaceDir through file media understanding requests", async () => {
     const output: MediaUnderstandingOutput = {
       kind: "image.description",
@@ -395,6 +469,7 @@ describe("media-understanding runtime", () => {
     await describeImageFileWithModel({
       filePath: "https://example.com/photo.png",
       mediaUrl: "https://example.com/photo.png",
+      mime: "image/*",
       provider: "zai",
       model: "glm-4.6v",
       prompt: "Describe it",
@@ -412,6 +487,58 @@ describe("media-understanding runtime", () => {
     expect(mocks.cleanup).toHaveBeenCalledTimes(1);
   });
 
+  it("fetches remote explicit image descriptions through the media attachment cache", async () => {
+    mocks.normalizeMediaAttachments.mockReturnValue([
+      { index: 0, url: "https://httpbin.org/image/png", mime: "image/png" },
+    ]);
+    mocks.buildProviderRegistry.mockReturnValue(
+      new Map([["zai", { id: "zai", capabilities: ["image"] }]]), // entry has no describeImage
+    );
+    mocks.getBuffer.mockResolvedValue({
+      buffer: Buffer.from("remote-png"),
+      fileName: "png",
+      mime: "image/png",
+      size: 10,
+    });
+
+    await expect(
+      describeImageFileWithModel({
+        filePath: "https://httpbin.org/image/png", // only filePath is given; no mediaUrl
+        provider: "zai",
+        model: "glm-4.6v",
+        prompt: "Describe it",
+        cfg: {} as OpenClawConfig,
+        agentDir: "/tmp/agent",
+        timeoutMs: 45_000,
+      }),
+    ).resolves.toEqual({ text: "generic image ok", model: "vision" });
+
+    expect(mocks.readLocalFileSafely).not.toHaveBeenCalled(); // URL is not read as a local file
+    expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
+      MediaUrl: "https://httpbin.org/image/png",
+      MediaType: "image/*",
+    });
+    expect(mocks.createMediaAttachmentCache).toHaveBeenCalledWith(
+      [{ index: 0, url: "https://httpbin.org/image/png", mime: "image/png" }],
+      { ssrfPolicy: undefined },
+    );
+    expect(mocks.getBuffer).toHaveBeenCalledWith({
+      attachmentIndex: 0,
+      maxBytes: 10 * 1024 * 1024, // presumably DEFAULT_MAX_BYTES.image — TODO confirm
+      timeoutMs: 45_000, // the caller's timeout is forwarded to the fetch
+    });
+    expect(mocks.describeImageWithModel).toHaveBeenCalledWith(
+      expect.objectContaining({
+        buffer: Buffer.from("remote-png"),
+        fileName: "png",
+        mime: "image/png", // taken from the fetched media; no mime was passed in
+        provider: "zai",
+        model: "glm-4.6v",
+      }),
+    );
+    expect(mocks.cleanup).toHaveBeenCalledOnce();
+  });
+
   it("routes direct image description through a provider-specific image hook", async () => {
     const describeImage = vi.fn(async () => ({
       text: "image ok",
diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts
index 6af15949417..0eb408fa2cf 100644
--- a/src/media-understanding/runtime.ts
+++ b/src/media-understanding/runtime.ts
@@ -1,5 +1,7 @@
 import path from "node:path";
+import type { OpenClawConfig } from "../config/types.js";
 import { readLocalFileSafely } from "../infra/fs-safe.js";
+import { kindFromMime, mimeTypeFromFilePath } from "../media/mime.js";
 import { DEFAULT_MAX_BYTES } from "./defaults.constants.js";
 import { describeImageWithModel } from "./image-runtime.js";
 import {
@@ -48,13 +50,61 @@ function resolveDecisionFailureReason(
   return normalizeDecisionReason(findDecisionReason(decision, "failed"));
 }
 
-function buildFileContext(params: { filePath: string; mediaUrl?: string; mime?: string }) {
+/** Builds the `MediaUrl` or `MediaPath` context, with `MediaType`, for one media reference. */
+function buildFileContext(params: {
+  filePath: string;
+  mediaUrl?: string;
+  mime?: string;
+  capability?: MediaUnderstandingCapability;
+}) {
+  // `||`, not `??`: an empty `mediaUrl` must not hide an http(s) `filePath`.
+  const remoteRef =
+    params.mediaUrl ||
+    (isRemoteMediaReference(params.filePath) ? params.filePath.trim() : undefined);
+  const extensionMime = remoteRef ? mimeTypeFromFilePath(remoteRef) : undefined;
+  const extensionKind = kindFromMime(extensionMime);
+  // Explicit `mime` wins; remote refs otherwise get `<capability>/*` or the extension MIME.
+  const mediaType =
+    params.mime ??
+    (remoteRef && params.capability && extensionKind === params.capability
+      ? `${params.capability}/*`
+      : extensionMime) ??
+    (remoteRef && params.capability ? `${params.capability}/*` : undefined);
+  if (remoteRef) {
+    return { MediaUrl: remoteRef, MediaType: mediaType };
+  }
   return {
-    ...(params.mediaUrl ? { MediaUrl: params.mediaUrl } : { MediaPath: params.filePath }),
-    MediaType: params.mime,
+    MediaPath: params.filePath,
+    MediaType: mediaType,
   };
 }
 
+function isRemoteMediaReference(value: string): boolean {
+  return /^https?:\/\//i.test(value.trim()); // http:// or https://, any case, after trimming
+}
+
+/** Trimmed `mime`, or undefined when it is blank or a wildcard such as `image/*`. */
+function concreteMime(mime: string | undefined): string | undefined {
+  const trimmed = mime?.trim();
+  // Wildcards are dropped so callers can `??`-fall back to another MIME source.
+  const isConcrete = Boolean(trimmed) && !trimmed?.endsWith("/*");
+  return isConcrete ? trimmed : undefined;
+}
+
+function resolveFileLocalRoots(filePath: string): string[] | undefined {
+  return isRemoteMediaReference(filePath) ? undefined : [path.dirname(filePath)]; // http(s): none
+}
+
+function basenameFromMediaReference(value: string): string {
+  if (isRemoteMediaReference(value)) {
+    try {
+      const url = new URL(value); // only the pathname feeds the name; query and hash are unused
+      return path.basename(url.pathname) || "image"; // "image" if the basename is empty
+    } catch {} // new URL() threw: fall through to the plain basename below
+  }
+  return path.basename(value);
+}
+
 function hasStructuredImageInput(input: ExtractStructuredWithModelParams["input"]): boolean {
   return input.some((entry) => entry.type === "image");
 }
@@ -93,7 +143,7 @@ export async function runMediaUnderstandingFile(
           },
         }
       : params.cfg;
-  const ctx = buildFileContext(params);
+  const ctx = buildFileContext({ ...params, capability: params.capability });
   const attachments = normalizeMediaAttachments(ctx);
   if (attachments.length === 0) {
     return {
@@ -114,7 +164,7 @@ export async function runMediaUnderstandingFile(
 
   const providerRegistry = buildProviderRegistry(undefined, cfg);
   const cache = createMediaAttachmentCache(attachments, {
-    localPathRoots: [path.dirname(params.filePath)],
+    localPathRoots: params.mediaUrl ? undefined : resolveFileLocalRoots(params.filePath),
     ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
   });
 
@@ -166,33 +216,18 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo
   const timeoutMs = params.timeoutMs ?? 30_000;
   const providerRegistry = buildProviderRegistry(undefined, params.cfg);
   const provider = providerRegistry.get(normalizeMediaProviderId(params.provider));
-  let buffer: Buffer;
-  let fileName = path.basename(params.filePath);
-  let mime = params.mime;
-  if (params.mediaUrl) {
-    const cache = createMediaAttachmentCache(normalizeMediaAttachments(buildFileContext(params)), {
-      ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy,
-    });
-    try {
-      const media = await cache.getBuffer({
-        attachmentIndex: 0,
-        maxBytes: DEFAULT_MAX_BYTES.image,
-        timeoutMs,
-      });
-      buffer = media.buffer;
-      fileName = media.fileName;
-      mime = media.mime;
-    } finally {
-      await cache.cleanup();
-    }
-  } else {
-    buffer = (await readLocalFileSafely({ filePath: params.filePath })).buffer;
-  }
+  const image = await readImageDescriptionInput({
+    filePath: params.filePath,
+    mediaUrl: params.mediaUrl,
+    mime: params.mime,
+    cfg: params.cfg,
+    timeoutMs,
+  });
   const describeImage = provider?.describeImage ?? describeImageWithModel;
   return await describeImage({
-    buffer,
-    fileName,
-    mime,
+    buffer: image.buffer,
+    fileName: image.fileName,
+    mime: image.mime,
     provider: params.provider,
     model: params.model,
     prompt: params.prompt,
@@ -204,6 +239,45 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo
   });
 }
 
+/** Reads image bytes locally, or via the attachment cache for `mediaUrl` / http(s) `filePath`. */
+async function readImageDescriptionInput(params: {
+  filePath: string;
+  mediaUrl?: string;
+  mime?: string;
+  cfg: OpenClawConfig;
+  timeoutMs: number;
+}): Promise<{ buffer: Buffer; fileName: string; mime?: string }> {
+  // `||`, not `??`: an empty `mediaUrl` must not hide an http(s) `filePath`.
+  const filePathUrl = isRemoteMediaReference(params.filePath) ? params.filePath.trim() : undefined;
+  const remoteRef = params.mediaUrl || filePathUrl;
+  if (!remoteRef) {
+    return {
+      buffer: (await readLocalFileSafely({ filePath: params.filePath })).buffer,
+      fileName: basenameFromMediaReference(params.filePath),
+      mime: params.mime,
+    };
+  }
+  const attachments = normalizeMediaAttachments(
+    buildFileContext({ ...params, mediaUrl: remoteRef, capability: "image" }),
+  );
+  const ssrfPolicy = params.cfg.tools?.web?.fetch?.ssrfPolicy;
+  const cache = createMediaAttachmentCache(attachments, { ssrfPolicy });
+  try {
+    const media = await cache.getBuffer({
+      attachmentIndex: 0,
+      maxBytes: DEFAULT_MAX_BYTES.image,
+      timeoutMs: params.timeoutMs,
+    });
+    return {
+      buffer: media.buffer,
+      fileName: media.fileName || basenameFromMediaReference(remoteRef),
+      mime: concreteMime(params.mime) ?? media.mime,
+    };
+  } finally {
+    await cache.cleanup();
+  }
+}
+
 export async function extractStructuredWithModel(params: ExtractStructuredWithModelParams) {
   const timeoutMs = params.timeoutMs ?? 30_000;
   if (!hasStructuredImageInput(params.input)) {