fix: route image URL describes through MiniMax VLM

Summary:
- Preserve HTTP image describe inputs as remote media.
- Route MiniMax CN image understanding through MiniMax-VL-01.
- Cover CLI, media runtime, tools, Telegram stickers, docs, and changelog.

Verification:
- codex-review clean
- pnpm check:changed via Blacksmith Testbox tbx_01krtdekwak0mygxbw5z7cfb6z
- PR CI green on 516281448e
2026-05-18 15:14:45 +00:00 · 2026-05-17 08:45:50 +01:00
parent 9a36e897be
commit 5d1f7bf058
27 changed files with 1425 additions and 77 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -85,6 +85,7 @@ Docs: https://docs.openclaw.ai
 - Agents/followups: route queued followup turns through CLI runtime backends instead of embedded harness lookup, preventing `claude-cli`/`google-gemini-cli` followups from failing before delivery. Fixes #82847. (#82857) Thanks @hclsys.
 - CLI/sessions: let `openclaw sessions cleanup --fix-missing` prune malformed rows with unresolvable transcript metadata instead of throwing. Fixes #80970. (#82745) Thanks @IWhatsskill.
 - Gateway/usage: refresh large session usage summaries in the background and reuse durable transcript metadata so `sessions.usage` no longer blocks Gateway requests on full transcript rescans. Fixes #82773. (#82778) Thanks @hclsys.
+- CLI/MiniMax media: let `openclaw infer image describe --file` accept HTTP(S) image URLs without treating them as local paths, and keep automatic MiniMax image understanding routed through `MiniMax-VL-01` even when legacy MiniMax M2.x chat metadata claims image input. Fixes #82837. Thanks @mGaolin.
 - TUI: restore the submitted draft when chat is busy instead of clearing it or queueing another run. Fixes #45326. (#82774) Thanks @hyspacex.
 - Cron/memory: treat claimed `before_agent_reply` cron hooks as execution progress, so long memory dreaming promotion jobs are not aborted by the isolated-run pre-execution watchdog. Fixes #82811.
 - Discord: recover transcript-backed full answers when progress-mode final payloads are ellipsis-truncated, so long replies fall back to normal chunked delivery instead of replacing the preview with a shortened message. Fixes #82807. Thanks @blueberry6401.
--- a/docs/cli/infer.md
+++ b/docs/cli/infer.md
@@ -107,19 +107,19 @@ runtime before the provider request is made.

 This table maps common inference tasks to the corresponding infer command.

-| Task                         | Command                                                                                       | Notes                                                 |
-| ---------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- |
-| Run a text/model prompt      | `openclaw infer model run --prompt "..." --json`                                              | Uses the normal local path by default                 |
-| Run a model prompt on images | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs             |
-| Generate an image            | `openclaw infer image generate --prompt "..." --json`                                         | Use `image edit` when starting from an existing file  |
-| Describe an image file       | `openclaw infer image describe --file ./image.png --prompt "..." --json`                      | `--model` must be an image-capable `<provider/model>` |
-| Transcribe audio             | `openclaw infer audio transcribe --file ./memo.m4a --json`                                    | `--model` must be `<provider/model>`                  |
-| Synthesize speech            | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json`                        | `tts status` is gateway-oriented                      |
-| Generate a video             | `openclaw infer video generate --prompt "..." --json`                                         | Supports provider hints such as `--resolution`        |
-| Describe a video file        | `openclaw infer video describe --file ./clip.mp4 --json`                                      | `--model` must be `<provider/model>`                  |
-| Search the web               | `openclaw infer web search --query "..." --json`                                              |                                                       |
-| Fetch a web page             | `openclaw infer web fetch --url https://example.com --json`                                   |                                                       |
-| Create embeddings            | `openclaw infer embedding create --text "..." --json`                                         |                                                       |
+| Task                          | Command                                                                                       | Notes                                                 |
+| ----------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- |
+| Run a text/model prompt       | `openclaw infer model run --prompt "..." --json`                                              | Uses the normal local path by default                 |
+| Run a model prompt on images  | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs             |
+| Generate an image             | `openclaw infer image generate --prompt "..." --json`                                         | Use `image edit` when starting from an existing file  |
+| Describe an image file or URL | `openclaw infer image describe --file ./image.png --prompt "..." --json`                      | `--model` must be an image-capable `<provider/model>` |
+| Transcribe audio              | `openclaw infer audio transcribe --file ./memo.m4a --json`                                    | `--model` must be `<provider/model>`                  |
+| Synthesize speech             | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json`                        | `tts status` is gateway-oriented                      |
+| Generate a video              | `openclaw infer video generate --prompt "..." --json`                                         | Supports provider hints such as `--resolution`        |
+| Describe a video file         | `openclaw infer video describe --file ./clip.mp4 --json`                                      | `--model` must be `<provider/model>`                  |
+| Search the web                | `openclaw infer web search --query "..." --json`                                              |                                                       |
+| Fetch a web page              | `openclaw infer web fetch --url https://example.com --json`                                   |                                                       |
+| Create embeddings             | `openclaw infer embedding create --text "..." --json`                                         |                                                       |

 ## Behavior

@@ -128,6 +128,7 @@ This table maps common inference tasks to the corresponding infer command.
 - Use `--provider` or `--model provider/model` when a specific backend is required.
 - Use `model run --thinking <level>` to pass a one-shot thinking/reasoning level (`off`, `minimal`, `low`, `medium`, `high`, `adaptive`, `xhigh`, or `max`) while keeping the run raw.
 - For `image describe`, `audio transcribe`, and `video describe`, `--model` must use the form `<provider/model>`.
+- For `image describe`, `--file` accepts local paths and HTTP(S) image URLs. Remote URLs use the normal media-fetch SSRF policy.
 - For `image describe`, an explicit `--model` runs that provider/model directly. The model must be image-capable in the model catalog or provider config. `codex/<model>` runs a bounded Codex app-server image-understanding turn; `openai-codex/<model>` uses the OpenAI Codex OAuth provider path.
 - Stateless execution commands default to local.
 - Gateway-managed state commands default to gateway.
@@ -192,6 +193,7 @@ openclaw infer image generate --prompt "slow image backend" --timeout-ms 180000
 openclaw infer image edit --file ./logo.png --model openai/gpt-image-1.5 --output-format png --background transparent --prompt "keep the logo, remove the background" --json
 openclaw infer image edit --file ./poster.png --prompt "make this a vertical story ad" --size 2160x3840 --aspect-ratio 9:16 --resolution 4K --json
 openclaw infer image describe --file ./photo.jpg --json
+openclaw infer image describe --file https://example.com/photo.png --json
 openclaw infer image describe --file ./receipt.jpg --prompt "Extract the merchant, date, and total" --json
 openclaw infer image describe-many --file ./before.png --file ./after.png --prompt "Compare the screenshots and list visible UI changes" --json
 openclaw infer image describe --file ./ui-screenshot.png --model openai/gpt-4.1-mini --json
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -260,8 +260,8 @@ For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
 <Note>
 **MiniMax note**

- `minimax` and `minimax-portal` image understanding comes from the plugin-owned `MiniMax-VL-01` media provider.
- The bundled MiniMax text catalog still starts text-only; explicit `models.providers.minimax` entries materialize image-capable M2.7 chat refs.
+- `minimax`, `minimax-cn`, `minimax-portal`, and `minimax-portal-cn` image understanding comes from the plugin-owned `MiniMax-VL-01` media provider.
+- Automatic image routing keeps using `MiniMax-VL-01` even if legacy MiniMax M2.x chat metadata claims image input.

 </Note>

--- a/extensions/telegram/src/sticker-cache.describe.test.ts
+++ b/extensions/telegram/src/sticker-cache.describe.test.ts
@@ -0,0 +1,125 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import { describeStickerImage } from "./sticker-cache.js";
+
+const mocks = vi.hoisted(() => {
+  const describeImageFileWithModel = vi.fn(async () => ({
+    text: "vlm ok",
+    model: "MiniMax-VL-01",
+  }));
+  return {
+    describeImageFileWithModel,
+    findModelInCatalog: vi.fn((_catalog, provider: string, model: string) => ({
+      provider,
+      id: model,
+      input: ["text", "image"],
+    })),
+    loadModelCatalog: vi.fn(async () => [
+      { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] },
+      { provider: "minimax", id: "MiniMax-M2.7", input: ["text", "image"] },
+    ]),
+    modelSupportsVision: vi.fn((entry: { input?: string[] } | undefined) =>
+      Boolean(entry?.input?.includes("image")),
+    ),
+    resolveApiKeyForProvider: vi.fn(async () => ({ apiKey: "minimax-test" })),
+    resolveAutoImageModel: vi.fn(async () => ({
+      provider: "minimax-cn",
+      model: "MiniMax-VL-01",
+    })),
+    resolveAutoMediaKeyProviders: vi.fn(() => ["minimax-cn", "minimax"]),
+    resolveDefaultMediaModel: vi.fn(() => "MiniMax-VL-01"),
+    resolveDefaultModelForAgent: vi.fn(() => ({
+      provider: "minimax-cn",
+      model: "MiniMax-M2.7",
+    })),
+  };
+});
+
+vi.mock("openclaw/plugin-sdk/agent-runtime", () => ({
+  findModelInCatalog: mocks.findModelInCatalog,
+  loadModelCatalog: mocks.loadModelCatalog,
+  modelSupportsVision: mocks.modelSupportsVision,
+  resolveApiKeyForProvider: mocks.resolveApiKeyForProvider,
+  resolveDefaultModelForAgent: mocks.resolveDefaultModelForAgent,
+}));
+
+vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
+  resolveAutoImageModel: mocks.resolveAutoImageModel,
+  resolveAutoMediaKeyProviders: mocks.resolveAutoMediaKeyProviders,
+  resolveDefaultMediaModel: mocks.resolveDefaultMediaModel,
+}));
+
+vi.mock("./runtime.js", () => ({
+  getTelegramRuntime: () => ({
+    mediaUnderstanding: {
+      describeImageFileWithModel: mocks.describeImageFileWithModel,
+    },
+  }),
+}));
+
+describe("describeStickerImage", () => {
+  beforeEach(() => {
+    mocks.describeImageFileWithModel.mockClear();
+    mocks.findModelInCatalog.mockClear();
+    mocks.loadModelCatalog.mockClear();
+    mocks.modelSupportsVision.mockClear();
+    mocks.resolveApiKeyForProvider.mockClear();
+    mocks.resolveAutoImageModel.mockClear();
+    mocks.resolveAutoMediaKeyProviders.mockClear();
+    mocks.resolveDefaultMediaModel.mockClear();
+    mocks.resolveDefaultModelForAgent.mockClear();
+  });
+
+  it("uses MiniMax VLM auto selection instead of legacy chat vision catalog entries", async () => {
+    await expect(
+      describeStickerImage({
+        imagePath: "/tmp/sticker.webp",
+        cfg: {},
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toBe("vlm ok");
+
+    expect(mocks.resolveDefaultMediaModel).toHaveBeenCalledWith({
+      cfg: {},
+      providerId: "minimax-cn",
+      capability: "image",
+      includeConfiguredImageModels: false,
+    });
+    expect(mocks.resolveAutoImageModel).not.toHaveBeenCalled();
+    expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith(
+      expect.objectContaining({
+        filePath: "/tmp/sticker.webp",
+        provider: "minimax-cn",
+        model: "MiniMax-VL-01",
+      }),
+    );
+  });
+
+  it("keeps MiniMax chat defaults on MiniMax VLM when other vision providers are configured", async () => {
+    mocks.resolveAutoMediaKeyProviders.mockReturnValue(["openai", "minimax-cn", "minimax"]);
+    mocks.loadModelCatalog.mockResolvedValue([
+      { provider: "openai", id: "gpt-5.4", input: ["text", "image"] },
+      { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] },
+      { provider: "minimax-cn", id: "MiniMax-VL-01", input: ["image"] },
+    ]);
+
+    await expect(
+      describeStickerImage({
+        imagePath: "/tmp/sticker.webp",
+        cfg: {},
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toBe("vlm ok");
+
+    expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "minimax-cn",
+        model: "MiniMax-VL-01",
+      }),
+    );
+    expect(mocks.describeImageFileWithModel).not.toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "openai",
+      }),
+    );
+  });
+});
--- a/extensions/telegram/src/sticker-cache.ts
+++ b/extensions/telegram/src/sticker-cache.ts
@@ -27,6 +27,16 @@ export {
 const STICKER_DESCRIPTION_PROMPT =
  "Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";

+function isMinimaxVlmProvider(provider: string): boolean {
+  const normalized = normalizeLowercaseStringOrEmpty(provider);
+  return (
+    normalized === "minimax" ||
+    normalized === "minimax-cn" ||
+    normalized === "minimax-portal" ||
+    normalized === "minimax-portal-cn"
+  );
+}
+
 export interface DescribeStickerParams {
  imagePath: string;
  cfg: OpenClawConfig;
@@ -50,7 +60,17 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
    const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
    const supportsVision = modelSupportsVision(entry);
    if (supportsVision) {
-      activeModel = { provider: defaultModel.provider, model: defaultModel.model };
+      const model = isMinimaxVlmProvider(defaultModel.provider)
+        ? resolveDefaultMediaModel({
+            cfg,
+            providerId: defaultModel.provider,
+            capability: "image",
+            includeConfiguredImageModels: false,
+          })
+        : defaultModel.model;
+      if (model) {
+        activeModel = { provider: defaultModel.provider, model };
+      }
    }
  } catch {
    // Ignore catalog failures; fall back to auto selection.
@@ -83,8 +103,12 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
      cfg,
      providerId: provider,
      capability: "image",
+      includeConfiguredImageModels: !isMinimaxVlmProvider(provider),
    });
    const preferred = entries.find((entry) => entry.id === defaultId);
+    if (isMinimaxVlmProvider(provider)) {
+      return preferred;
+    }
    return preferred ?? entries[0];
  };

--- a/src/agents/minimax-vlm.normalizes-api-key.test.ts
+++ b/src/agents/minimax-vlm.normalizes-api-key.test.ts
@@ -75,6 +75,61 @@ describe("minimaxUnderstandImage apiKey normalization", () => {
    expect(fetchSpy).toHaveBeenCalledOnce();
  });

+  it.each(["minimax-cn", "minimax-portal-cn"])(
+    "routes %s to the CN VLM host by default",
+    async (provider) => {
+      const fetchSpy = vi.fn(async (input: RequestInfo | URL) => {
+        const requestUrl =
+          typeof input === "string" ? input : input instanceof URL ? input.href : input.url;
+        expect(requestUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
+        return new Response(apiResponse, {
+          status: 200,
+          headers: { "Content-Type": "application/json" },
+        });
+      });
+      global.fetch = withFetchPreconnect(fetchSpy);
+
+      await expect(
+        minimaxUnderstandImage({
+          apiKey: "minimax-test-key",
+          provider,
+          prompt: "hi",
+          imageDataUrl: "data:image/png;base64,AAAA",
+        }),
+      ).resolves.toBe("ok");
+
+      expect(fetchSpy).toHaveBeenCalledOnce();
+    },
+  );
+
+  it.each(["minimax-cn", "minimax-portal-cn"])(
+    "keeps %s on the CN VLM host when the configured host is malformed",
+    async (provider) => {
+      const fetchSpy = vi.fn(async (input: RequestInfo | URL) => {
+        const requestUrl =
+          typeof input === "string" ? input : input instanceof URL ? input.href : input.url;
+        expect(requestUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
+        return new Response(apiResponse, {
+          status: 200,
+          headers: { "Content-Type": "application/json" },
+        });
+      });
+      global.fetch = withFetchPreconnect(fetchSpy);
+
+      await expect(
+        minimaxUnderstandImage({
+          apiKey: "minimax-test-key",
+          provider,
+          apiHost: "https://[",
+          prompt: "hi",
+          imageDataUrl: "data:image/png;base64,AAAA",
+        }),
+      ).resolves.toBe("ok");
+
+      expect(fetchSpy).toHaveBeenCalledOnce();
+    },
+  );
+
  it("uses the caller-provided request timeout", async () => {
    const timeoutSpy = vi.spyOn(AbortSignal, "timeout");
    const fetchSpy = vi.fn(async () => {
@@ -103,7 +158,9 @@ describe("minimaxUnderstandImage apiKey normalization", () => {
 describe("isMinimaxVlmModel", () => {
  it("only matches the canonical MiniMax VLM model id", () => {
    expect(isMinimaxVlmModel("minimax", "MiniMax-VL-01")).toBe(true);
+    expect(isMinimaxVlmModel("minimax-cn", "MiniMax-VL-01")).toBe(true);
    expect(isMinimaxVlmModel("minimax-portal", "MiniMax-VL-01")).toBe(true);
+    expect(isMinimaxVlmModel("minimax-portal-cn", "MiniMax-VL-01")).toBe(true);
    expect(isMinimaxVlmModel("minimax-portal", "custom-vision")).toBe(false);
    expect(isMinimaxVlmModel("openai", "MiniMax-VL-01")).toBe(false);
  });
--- a/src/agents/minimax-vlm.ts
+++ b/src/agents/minimax-vlm.ts
@@ -8,35 +8,54 @@ type MinimaxBaseResp = {
 };

 export function isMinimaxVlmProvider(provider: string): boolean {
-  return provider === "minimax" || provider === "minimax-portal";
+  const normalized = provider.trim().toLowerCase();
+  return (
+    normalized === "minimax" ||
+    normalized === "minimax-cn" ||
+    normalized === "minimax-portal" ||
+    normalized === "minimax-portal-cn"
+  );
 }

 export function isMinimaxVlmModel(provider: string, modelId: string): boolean {
  return isMinimaxVlmProvider(provider) && modelId.trim() === "MiniMax-VL-01";
 }

+function isMinimaxCnProvider(provider: string | undefined): boolean {
+  const normalized = provider?.trim().toLowerCase();
+  return normalized === "minimax-cn" || normalized === "minimax-portal-cn";
+}
+
 function coerceApiHost(params: {
  apiHost?: string;
  modelBaseUrl?: string;
+  provider?: string;
  env?: NodeJS.ProcessEnv;
 }): string {
  const env = params.env ?? process.env;
+  const defaultHost = isMinimaxCnProvider(params.provider)
+    ? "https://api.minimaxi.com"
+    : "https://api.minimax.io";
  const raw =
    params.apiHost?.trim() ||
    env.MINIMAX_API_HOST?.trim() ||
    params.modelBaseUrl?.trim() ||
-    "https://api.minimax.io";
+    defaultHost;

  try {
    const url = new URL(raw);
    return url.origin;
  } catch {}

+  if (/^[a-z][a-z\d+.-]*:\/\//i.test(raw)) {
+    return defaultHost;
+  }
+
  try {
    const url = new URL(`https://${raw}`);
    return url.origin;
  } catch {
-    return "https://api.minimax.io";
+    return defaultHost;
  }
 }

@@ -51,6 +70,7 @@ export async function minimaxUnderstandImage(params: {
  imageDataUrl: string;
  apiHost?: string;
  modelBaseUrl?: string;
+  provider?: string;
  timeoutMs?: number;
 }): Promise<string> {
  const apiKey = normalizeSecretInput(params.apiKey);
@@ -72,6 +92,7 @@ export async function minimaxUnderstandImage(params: {
  const host = coerceApiHost({
    apiHost: params.apiHost,
    modelBaseUrl: params.modelBaseUrl,
+    provider: params.provider,
  });
  const url = new URL("/v1/coding_plan/vlm", host).toString();

--- a/src/agents/tools/image-tool.helpers.ts
+++ b/src/agents/tools/image-tool.helpers.ts
@@ -2,6 +2,7 @@ import type { AssistantMessage } from "@earendil-works/pi-ai";
 import type { OpenClawConfig } from "../../config/types.openclaw.js";
 import { estimateBase64DecodedBytes } from "../../media/base64.js";
 import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
+import { isMinimaxVlmProvider } from "../minimax-vlm.js";
 import { findNormalizedProviderValue, normalizeProviderId } from "../model-selection.js";
 import { extractAssistantText } from "../pi-embedded-utils.js";
 import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.helpers.js";
@@ -238,6 +239,9 @@ export function resolveProviderVisionModelFromConfig(params: {
  cfg?: OpenClawConfig;
  provider: string;
 }): string | null {
+  if (isMinimaxVlmProvider(params.provider)) {
+    return null;
+  }
  const providerCfg = findNormalizedProviderValue(
    params.cfg?.models?.providers,
    params.provider,
--- a/src/agents/tools/image-tool.test.ts
+++ b/src/agents/tools/image-tool.test.ts
@@ -181,7 +181,9 @@ async function createOpenClawCodingToolsWithFreshModules(options?: CreateOpenCla
  const defaultImageModels = new Map<string, string>([
    ["anthropic", "claude-opus-4-6"],
    ["minimax", "MiniMax-VL-01"],
+    ["minimax-cn", "MiniMax-VL-01"],
    ["minimax-portal", "MiniMax-VL-01"],
+    ["minimax-portal-cn", "MiniMax-VL-01"],
    ["openai", "gpt-5.4-mini"],
    ["opencode", "gpt-5-nano"],
    ["opencode-go", "kimi-k2.6"],
@@ -482,7 +484,9 @@ function installImageUnderstandingProviderStubs(...providers: MediaUnderstanding
  const defaultImageModels = new Map<string, string>([
    ["anthropic", "claude-opus-4-6"],
    ["minimax", "MiniMax-VL-01"],
+    ["minimax-cn", "MiniMax-VL-01"],
    ["minimax-portal", "MiniMax-VL-01"],
+    ["minimax-portal-cn", "MiniMax-VL-01"],
    ["openai", "gpt-5.4-mini"],
    ["opencode", "gpt-5-nano"],
    ["opencode-go", "kimi-k2.6"],
@@ -764,6 +768,127 @@ describe("image tool implicit imageModel config", () => {
    });
  });

+  it("keeps MiniMax CN chat metadata off automatic image routing", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      const cfg: OpenClawConfig = {
+        agents: { defaults: { model: { primary: "minimax-cn/MiniMax-M2.5" } } },
+        models: {
+          mode: "merge",
+          providers: {
+            "minimax-cn": {
+              baseUrl: "https://api.minimaxi.com/anthropic",
+              apiKey: "${MINIMAX_API_KEY}",
+              api: "anthropic-messages",
+              models: [makeModelDefinition("MiniMax-M2.5", ["text", "image"])],
+            },
+          },
+        },
+      };
+      const authStore = {
+        version: 1,
+        profiles: {
+          mini: { type: "api_key", provider: "minimax-cn", key: "minimax-test" },
+          miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
+        },
+      } as const;
+
+      expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
+        primary: "minimax-cn/MiniMax-VL-01",
+      });
+    });
+  });
+
+  it("prefers configured MiniMax CN image alias over canonical auto fallback", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      const defaultImageModels = new Map<string, string>([
+        ["anthropic", "claude-opus-4-6"],
+        ["minimax", "MiniMax-VL-01"],
+        ["minimax-cn", "MiniMax-VL-01"],
+        ["openai", "gpt-5.4-mini"],
+      ]);
+      __testing.setProviderDepsForTest({
+        buildProviderRegistry: (overrides?: Record<string, MediaUnderstandingProvider>) =>
+          imageProviderHarness.buildProviderRegistry(overrides),
+        getMediaUnderstandingProvider: (
+          id: string,
+          registry: Map<string, MediaUnderstandingProvider>,
+        ) => imageProviderHarness.getMediaUnderstandingProvider(id, registry),
+        describeImageWithModel: describeGenericImageWithModel,
+        describeImagesWithModel: describeGenericImagesWithModel,
+        resolveAutoMediaKeyProviders: ({ capability }) =>
+          capability === "image" ? ["openai", "anthropic", "minimax-cn", "minimax"] : [],
+        resolveDefaultMediaModel: ({ providerId, capability }) =>
+          capability === "image" ? defaultImageModels.get(providerId.toLowerCase()) : undefined,
+      });
+      const cfg: OpenClawConfig = {
+        models: {
+          mode: "merge",
+          providers: {
+            "minimax-cn": {
+              baseUrl: "https://api.minimaxi.com/anthropic",
+              apiKey: "${MINIMAX_API_KEY}",
+              api: "anthropic-messages",
+              models: [makeModelDefinition("MiniMax-M2.5", ["text", "image"])],
+            },
+          },
+        },
+      };
+      const authStore = {
+        version: 1,
+        profiles: {
+          mini: { type: "api_key", provider: "minimax-cn", key: "minimax-test" },
+          miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
+        },
+      } as const;
+
+      expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
+        primary: "minimax-cn/MiniMax-VL-01",
+      });
+    });
+  });
+
+  it("keeps canonical MiniMax fallback when configured CN alias has no image candidate", async () => {
+    await withTempAgentDir(async (agentDir) => {
+      __testing.setProviderDepsForTest({
+        buildProviderRegistry: (overrides?: Record<string, MediaUnderstandingProvider>) =>
+          imageProviderHarness.buildProviderRegistry(overrides),
+        getMediaUnderstandingProvider: (
+          id: string,
+          registry: Map<string, MediaUnderstandingProvider>,
+        ) => imageProviderHarness.getMediaUnderstandingProvider(id, registry),
+        describeImageWithModel: describeGenericImageWithModel,
+        describeImagesWithModel: describeGenericImagesWithModel,
+        resolveAutoMediaKeyProviders: ({ capability }) =>
+          capability === "image" ? ["minimax"] : [],
+        resolveDefaultMediaModel: ({ providerId, capability }) =>
+          capability === "image" && providerId === "minimax" ? "MiniMax-VL-01" : undefined,
+      });
+      const cfg: OpenClawConfig = {
+        models: {
+          mode: "merge",
+          providers: {
+            "minimax-cn": {
+              baseUrl: "https://api.minimaxi.com/anthropic",
+              apiKey: "${MINIMAX_API_KEY}",
+              api: "anthropic-messages",
+              models: [],
+            },
+          },
+        },
+      };
+      const authStore = {
+        version: 1,
+        profiles: {
+          miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
+        },
+      } as const;
+
+      expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
+        primary: "minimax/MiniMax-VL-01",
+      });
+    });
+  });
+
  it("passes the configured image timeout to provider calls", async () => {
    await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => {
      await withTempAgentDir(async (agentDir) => {
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -68,6 +68,50 @@ const imageToolProviderDeps = {
  resolveDefaultMediaModel,
 };

+function hasExplicitDefaultPrimaryModel(cfg?: OpenClawConfig): boolean {
+  const model = cfg?.agents?.defaults?.model;
+  if (typeof model === "string") {
+    return model.trim().length > 0;
+  }
+  return typeof model?.primary === "string" && model.primary.trim().length > 0;
+}
+
+function modelRefProvider(candidate: string | null | undefined): string | undefined {
+  const trimmed = candidate?.trim();
+  if (!trimmed?.includes("/")) {
+    return undefined;
+  }
+  return trimmed.slice(0, trimmed.indexOf("/")).trim();
+}
+
+function isExecutionAliasCandidateForProvider(
+  candidate: string | null | undefined,
+  provider: string,
+): boolean {
+  const candidateProvider = modelRefProvider(candidate);
+  return Boolean(
+    candidateProvider &&
+    candidateProvider !== normalizeMediaProviderId(candidateProvider) &&
+    normalizeMediaProviderId(candidateProvider) === normalizeMediaProviderId(provider),
+  );
+}
+
+function isCanonicalCandidateShadowedByExecutionAlias(
+  candidate: string | null | undefined,
+  candidates: readonly (string | null | undefined)[],
+): boolean {
+  const candidateProvider = modelRefProvider(candidate);
+  if (!candidateProvider || candidateProvider !== normalizeMediaProviderId(candidateProvider)) {
+    return false;
+  }
+  if (!isMinimaxVlmProvider(candidateProvider)) {
+    return false;
+  }
+  return candidates.some((shadowCandidate) =>
+    isExecutionAliasCandidateForProvider(shadowCandidate, candidateProvider),
+  );
+}
+
 export const __testing = {
  decodeDataUrl,
  coerceImageAssistantText,
@@ -148,6 +192,7 @@ export function resolveImageModelConfigForTool(params: {
      workspaceDir: params.workspaceDir,
      providerId: primary.provider,
      capability: "image",
+      includeConfiguredImageModels: !isMinimaxVlmProvider(primary.provider),
    });
    if (providerDefault) {
      return [`${primary.provider}/${providerDefault}`];
@@ -158,7 +203,7 @@ export function resolveImageModelConfigForTool(params: {
    return [];
  })();

-  const autoCandidates = imageToolProviderDeps
+  const rawAutoCandidates = imageToolProviderDeps
    .resolveAutoMediaKeyProviders({
      cfg: params.cfg,
      workspaceDir: params.workspaceDir,
@@ -170,15 +215,33 @@ export function resolveImageModelConfigForTool(params: {
        workspaceDir: params.workspaceDir,
        providerId,
        capability: "image",
+        includeConfiguredImageModels: !isMinimaxVlmProvider(providerId),
      });
      return modelId ? `${providerId}/${modelId}` : null;
    });
+  const autoCandidates = rawAutoCandidates.filter(
+    (candidate) =>
+      !isCanonicalCandidateShadowedByExecutionAlias(candidate, [
+        ...primaryCandidates,
+        ...rawAutoCandidates,
+      ]),
+  );
+  const defaultPrimaryIsImplicit = !hasExplicitDefaultPrimaryModel(params.cfg);
+  const primaryAliasCandidates = defaultPrimaryIsImplicit
+    ? autoCandidates.filter((candidate) =>
+        isExecutionAliasCandidateForProvider(candidate, primary.provider),
+      )
+    : [];
+  const remainingAutoCandidates =
+    primaryAliasCandidates.length === 0
+      ? autoCandidates
+      : autoCandidates.filter((candidate) => !primaryAliasCandidates.includes(candidate));

  return buildToolModelConfigFromCandidates({
    explicit,
    agentDir: params.agentDir,
    authStore: params.authStore,
-    candidates: [...primaryCandidates, ...autoCandidates],
+    candidates: [...primaryAliasCandidates, ...primaryCandidates, ...remainingAutoCandidates],
  });
 }

--- a/src/agents/tools/pdf-tool.model-config.test.ts
+++ b/src/agents/tools/pdf-tool.model-config.test.ts
@@ -28,6 +28,9 @@ vi.mock("./model-config.helpers.js", () => ({
    if (provider === "google") {
      return Boolean(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
    }
+    if (provider === "minimax" || provider === "minimax-cn") {
+      return Boolean(process.env.MINIMAX_API_KEY);
+    }
    return false;
  },
  resolveDefaultModelRef: (cfg?: OpenClawConfig) => {
@@ -105,4 +108,33 @@ describe("resolvePdfModelConfigForTool", () => {
      ANTHROPIC_PDF_MODEL,
    );
  });
+
+  it("does not add configured MiniMax chat models as automatic PDF image fallbacks", () => {
+    vi.stubEnv("MINIMAX_API_KEY", "minimax-test");
+    const cfg = {
+      ...withDefaultModel("openai/gpt-5.4"),
+      models: {
+        providers: {
+          minimax: {
+            baseUrl: "https://api.minimax.io/anthropic",
+            models: [
+              {
+                id: "MiniMax-M2.7",
+                name: "MiniMax M2.7",
+                reasoning: false,
+                input: ["text", "image"],
+                cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+                contextWindow: 128_000,
+                maxTokens: 8_192,
+              },
+            ],
+          },
+        },
+      },
+    } as OpenClawConfig;
+
+    expect(resolvePdfModelConfigForTool({ cfg, agentDir: TEST_AGENT_DIR })).toEqual({
+      primary: "minimax/MiniMax-VL-01",
+    });
+  });
 });
--- a/src/agents/tools/pdf-tool.model-config.ts
+++ b/src/agents/tools/pdf-tool.model-config.ts
@@ -5,6 +5,7 @@ import {
  resolveDefaultMediaModel,
 } from "../../media-understanding/defaults.js";
 import type { AuthProfileStore } from "../auth-profiles/types.js";
+import { isMinimaxVlmProvider } from "../minimax-vlm.js";
 import {
  coerceImageModelConfig,
  type ImageModelConfig,
@@ -45,6 +46,7 @@ function resolveImageCandidateRefs(params: {
          workspaceDir: params.workspaceDir,
          providerId,
          capability: "image",
+          includeConfiguredImageModels: !isMinimaxVlmProvider(providerId),
        });
      return modelId ? `${providerId}/${modelId}` : null;
    })
@@ -106,6 +108,7 @@ export function resolvePdfModelConfigForTool(params: {
      workspaceDir: params.workspaceDir,
      providerId: primary.provider,
      capability: "image",
+      includeConfiguredImageModels: !isMinimaxVlmProvider(primary.provider),
    });
  const primarySupportsNativePdf = providerSupportsNativePdfDocument({
    cfg: params.cfg,
@@ -136,6 +139,7 @@ export function resolvePdfModelConfigForTool(params: {
      const providerId = providerKey.trim();
      if (
        !providerId ||
+        isMinimaxVlmProvider(providerId) ||
        !hasAuthForProvider({
          provider: providerId,
          agentDir: params.agentDir,
--- a/src/cli/capability-cli.test.ts
+++ b/src/cli/capability-cli.test.ts
@@ -1125,6 +1125,26 @@ describe("capability cli", () => {
    expect(outputs[0]?.kind).toBe("image.description");
  });

+  it("keeps image describe HTTP URLs as URLs", async () => {
+    await runRegisteredCli({
+      register: registerCapabilityCli as (program: Command) => void,
+      argv: [
+        "capability",
+        "image",
+        "describe",
+        "--file",
+        "https://httpbin.org/image/png",
+        "--json",
+      ],
+    });
+
+    const describeCall = imageDescribeCall();
+    expect(describeCall?.filePath).toBe("https://httpbin.org/image/png");
+    const output = firstJsonOutput();
+    const outputs = output?.outputs as Array<Record<string, unknown>>;
+    expect(outputs[0]?.path).toBe("https://httpbin.org/image/png");
+  });
+
  it("passes image describe prompts through media understanding", async () => {
    await runRegisteredCli({
      register: registerCapabilityCli as (program: Command) => void,
@@ -1221,6 +1241,28 @@ describe("capability cli", () => {
    expect(outputs[0]?.path).toBe("https://example.com/photo.png");
  });

+  it("keeps explicit-model image describe HTTP URLs as URLs", async () => {
+    await runRegisteredCli({
+      register: registerCapabilityCli as (program: Command) => void,
+      argv: [
+        "capability",
+        "image",
+        "describe",
+        "--file",
+        "https://httpbin.org/image/png",
+        "--model",
+        "minimax-cn/MiniMax-VL-01",
+        "--json",
+      ],
+    });
+
+    const describeCall = firstImageDescribeWithModelCall();
+    expect(describeCall?.filePath).toBe("https://httpbin.org/image/png");
+    expect(describeCall?.provider).toBe("minimax-cn");
+    expect(describeCall?.model).toBe("MiniMax-VL-01");
+    expect(mocks.describeImageFile).not.toHaveBeenCalled();
+  });
+
  it("passes describe-many prompts to each image", async () => {
    await runRegisteredCli({
      register: registerCapabilityCli as (program: Command) => void,
--- a/src/cli/capability-cli.ts
+++ b/src/cli/capability-cli.ts
@@ -1097,8 +1097,8 @@ async function runImageDescribe(params: {
  const prompt = normalizeOptionalString(params.prompt);
  const outputs = await Promise.all(
    params.files.map(async (filePath) => {
-      const isRemoteUrl = /^https?:\/\//i.test(filePath.trim());
-      const resolvedPath = isRemoteUrl ? filePath.trim() : path.resolve(filePath);
+      const resolvedPath = resolveImageDescribeInput(filePath);
+      const isRemoteUrl = /^https?:\/\//i.test(resolvedPath);
      const result = activeModel
        ? await describeImageFileWithModel({
            filePath: resolvedPath,
@@ -1513,6 +1513,11 @@ async function runTtsProviders(transport: CapabilityTransport) {
  };
 }

+function resolveImageDescribeInput(filePath: string): string {
+  const trimmed = filePath.trim();
+  return /^https?:\/\//i.test(trimmed) ? trimmed : path.resolve(filePath);
+}
+
 async function runTtsPersonas(transport: CapabilityTransport) {
  if (transport === "gateway") {
    return await callGateway({
--- a/src/media-understanding/attachments.cache.ts
+++ b/src/media-understanding/attachments.cache.ts
@@ -54,6 +54,14 @@ type AttachmentCacheEntry = {

 let defaultLocalPathRoots: readonly string[] | undefined;

+function concreteMime(mime: string | undefined): string | undefined {
+  const normalized = mime?.trim();
+  if (!normalized || normalized.endsWith("/*")) {
+    return undefined;
+  }
+  return normalized;
+}
+
 function getDefaultLocalPathRoots(): readonly string[] {
  defaultLocalPathRoots ??= mergeInboundPathRoots(getDefaultMediaLocalRoots());
  return defaultLocalPathRoots;
@@ -128,7 +136,7 @@ export class MediaAttachmentCache {
          entry.buffer = buffer;
          entry.bufferMime =
            entry.bufferMime ??
-            entry.attachment.mime ??
+            concreteMime(entry.attachment.mime) ??
            (await detectMime({
              buffer,
              filePath,
@@ -169,7 +177,7 @@ export class MediaAttachmentCache {
      });
      entry.buffer = fetched.buffer;
      entry.bufferMime =
-        entry.attachment.mime ??
+        concreteMime(entry.attachment.mime) ??
        fetched.contentType ??
        (await detectMime({
          buffer: fetched.buffer,
--- a/src/media-understanding/defaults.test.ts
+++ b/src/media-understanding/defaults.test.ts
@@ -140,6 +140,30 @@ describe("resolveDefaultMediaModel", () => {
      "kimi-k2.6",
    );
  });
+
+  it("prefers configured image models before manifest defaults", () => {
+    const cfg = {
+      models: {
+        providers: {
+          openrouter: {
+            models: [{ id: "google/gemini-2.5-flash", input: ["text", "image"] }],
+          },
+        },
+      },
+    } as never;
+
+    expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "image", cfg })).toBe(
+      "google/gemini-2.5-flash",
+    );
+    expect(
+      resolveDefaultMediaModel({
+        providerId: "openrouter",
+        capability: "image",
+        cfg,
+        includeConfiguredImageModels: false,
+      }),
+    ).toBe("auto");
+  });
 });

 describe("resolveAutoMediaKeyProviders", () => {
@@ -166,6 +190,36 @@ describe("resolveAutoMediaKeyProviders", () => {
    ]);
  });

+  it("preserves configured MiniMax CN aliases for image auto discovery", () => {
+    const providers = resolveAutoMediaKeyProviders({
+      capability: "image",
+      cfg: {
+        models: {
+          providers: {
+            "minimax-cn": {
+              models: [{ id: "MiniMax-M2.7", input: ["text", "image"] }],
+            },
+            "minimax-portal-cn": {
+              models: [{ id: "MiniMax-M2.7", input: ["text", "image"] }],
+            },
+            gemini: {
+              models: [{ id: "gemini-3-flash-preview", input: ["text", "image"] }],
+            },
+          },
+        },
+      } as never,
+    });
+
+    expect(providers).toContain("minimax-cn");
+    expect(providers).toContain("minimax-portal-cn");
+    expect(providers).not.toContain("gemini");
+    expect(providers).toContain("google");
+    expect(providers.indexOf("minimax-cn")).toBeLessThan(providers.indexOf("minimax"));
+    expect(providers.indexOf("minimax-portal-cn")).toBeLessThan(
+      providers.indexOf("minimax-portal"),
+    );
+  });
+
  it("keeps the bundled video fallback order", () => {
    expect(resolveAutoMediaKeyProviders({ capability: "video" })).toEqual([
      "google",
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -2,7 +2,10 @@ import { resolveRuntimeConfigCacheKey } from "../config/runtime-snapshot.js";
 import type { OpenClawConfig } from "../config/types.js";
 import { normalizeOptionalString } from "../shared/string-coerce.js";
 import { buildMediaUnderstandingManifestMetadataRegistry } from "./manifest-metadata.js";
-import { normalizeMediaProviderId } from "./provider-registry.js";
+import {
+  normalizeMediaExecutionProviderId,
+  normalizeMediaProviderId,
+} from "./provider-registry.js";
 import { providerSupportsCapability } from "./provider-supports.js";
 import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
 export {
@@ -65,11 +68,11 @@ function resolveConfiguredImageProviderModel(params: {
  cfg?: OpenClawConfig;
  providerId: string;
 }): string | undefined {
+  const normalizedProviderId = normalizeMediaProviderId(params.providerId);
  const providers = params.cfg?.models?.providers;
  if (!providers || typeof providers !== "object") {
    return undefined;
  }
-  const normalizedProviderId = normalizeMediaProviderId(params.providerId);
  for (const [providerKey, providerCfg] of Object.entries(providers)) {
    if (normalizeMediaProviderId(providerKey) !== normalizedProviderId) {
      continue;
@@ -93,7 +96,7 @@ function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] {
  }
  const configured: string[] = [];
  for (const [providerKey, providerCfg] of Object.entries(providers)) {
-    const normalizedProviderId = normalizeMediaProviderId(providerKey);
+    const normalizedProviderId = normalizeMediaExecutionProviderId(providerKey);
    if (!normalizedProviderId || configured.includes(normalizedProviderId)) {
      continue;
    }
@@ -108,14 +111,39 @@ function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] {
  return configured;
 }

+function isExecutionAliasProvider(providerId: string): boolean {
+  return normalizeMediaProviderId(providerId) !== providerId;
+}
+
+function insertConfiguredImageProviders(params: {
+  prioritized: string[];
+  configured: string[];
+}): string[] {
+  const merged = [...params.prioritized];
+  for (const providerId of params.configured.filter(isExecutionAliasProvider)) {
+    const canonicalProviderId = normalizeMediaProviderId(providerId);
+    const canonicalIndex = merged.indexOf(canonicalProviderId);
+    if (canonicalIndex >= 0) {
+      merged.splice(canonicalIndex, 0, providerId);
+    } else {
+      merged.unshift(providerId);
+    }
+  }
+  for (const providerId of params.configured.filter((id) => !isExecutionAliasProvider(id))) {
+    merged.push(providerId);
+  }
+  return [...new Set(merged)];
+}
+
 export function resolveDefaultMediaModel(params: {
  providerId: string;
  capability: MediaUnderstandingCapability;
  cfg?: OpenClawConfig;
  workspaceDir?: string;
  providerRegistry?: Map<string, MediaUnderstandingProvider>;
+  includeConfiguredImageModels?: boolean;
 }): string | undefined {
-  if (!params.providerRegistry) {
+  if (!params.providerRegistry && params.includeConfiguredImageModels !== false) {
    const configuredImageModel =
      params.capability === "image"
        ? resolveConfiguredImageProviderModel({
@@ -130,7 +158,13 @@ export function resolveDefaultMediaModel(params: {
  const registry =
    params.providerRegistry ?? resolveDefaultRegistry(params.cfg, params.workspaceDir);
  const provider = registry.get(normalizeMediaProviderId(params.providerId));
-  return normalizeOptionalString(provider?.defaultModels?.[params.capability]);
+  const manifestDefaultModel = normalizeOptionalString(
+    provider?.defaultModels?.[params.capability],
+  );
+  if (manifestDefaultModel) {
+    return manifestDefaultModel;
+  }
+  return undefined;
 }

 export function resolveAutoMediaKeyProviders(params: {
@@ -165,7 +199,10 @@ export function resolveAutoMediaKeyProviders(params: {
  if (params.providerRegistry || params.capability !== "image") {
    return prioritized;
  }
-  return [...new Set([...prioritized, ...resolveConfiguredImageProviderIds(params.cfg)])];
+  return insertConfiguredImageProviders({
+    prioritized,
+    configured: resolveConfiguredImageProviderIds(params.cfg),
+  });
 }

 export function providerSupportsNativePdfDocument(params: {
--- a/src/media-understanding/image.test.ts
+++ b/src/media-understanding/image.test.ts
@@ -335,6 +335,135 @@ describe("describeImageWithModel", () => {
    expect(fetchMock).toHaveBeenCalledOnce();
  });

+  it("uses canonical MiniMax CN baseUrl for VLM alias fallback", async () => {
+    const authStorage = {
+      setRuntimeApiKey: setRuntimeApiKeyMock,
+    };
+    resolveModelAsyncMock.mockResolvedValue({
+      authStorage,
+      modelRegistry: { find: vi.fn(() => null) },
+      error: "Unknown model: minimax-cn/MiniMax-VL-01",
+    });
+
+    await expect(
+      describeImageWithModel({
+        cfg: {
+          models: {
+            providers: {
+              minimax: {
+                apiKey: "minimax-test-key",
+                baseUrl: "https://api.minimaxi.com/anthropic",
+                models: [],
+              },
+            },
+          },
+        },
+        agentDir: "/tmp/openclaw-agent",
+        provider: "minimax-cn",
+        model: "MiniMax-VL-01",
+        buffer: Buffer.from("png-bytes"),
+        fileName: "image.png",
+        mime: "image/png",
+        prompt: "Describe the image.",
+        timeoutMs: 1000,
+      }),
+    ).resolves.toEqual({
+      text: "portal ok",
+      model: "MiniMax-VL-01",
+    });
+
+    expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "minimax",
+      }),
+    );
+    const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
+    expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
+  });
+
+  it("uses MiniMax CN alias auth when the alias apiKey is a SecretRef", async () => {
+    const authStorage = {
+      setRuntimeApiKey: setRuntimeApiKeyMock,
+    };
+    resolveModelAsyncMock.mockResolvedValue({
+      authStorage,
+      modelRegistry: { find: vi.fn(() => null) },
+      error: "Unknown model: minimax-cn/MiniMax-VL-01",
+    });
+
+    await expect(
+      describeImageWithModel({
+        cfg: {
+          models: {
+            providers: {
+              "minimax-cn": {
+                apiKey: { source: "file", provider: "default", id: "/providers/minimax-cn/apiKey" },
+                baseUrl: "https://api.minimaxi.com/anthropic",
+                models: [],
+              },
+            },
+          },
+        },
+        agentDir: "/tmp/openclaw-agent",
+        provider: "minimax-cn",
+        model: "MiniMax-VL-01",
+        buffer: Buffer.from("png-bytes"),
+        fileName: "image.png",
+        mime: "image/png",
+        prompt: "Describe the image.",
+        timeoutMs: 1000,
+      }),
+    ).resolves.toEqual({
+      text: "portal ok",
+      model: "MiniMax-VL-01",
+    });
+
+    expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "minimax-cn",
+      }),
+    );
+    const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
+    expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
+  });
+
+  it("does not inherit global MiniMax baseUrl for CN VLM aliases", async () => {
+    const authStorage = {
+      setRuntimeApiKey: setRuntimeApiKeyMock,
+    };
+    resolveModelAsyncMock.mockResolvedValue({
+      authStorage,
+      modelRegistry: { find: vi.fn(() => null) },
+      error: "Unknown model: minimax-cn/MiniMax-VL-01",
+    });
+
+    await expect(
+      describeImageWithModel({
+        cfg: {
+          models: {
+            providers: {
+              minimax: { baseUrl: "https://api.minimax.io/anthropic", models: [] },
+            },
+          },
+        },
+        agentDir: "/tmp/openclaw-agent",
+        provider: "minimax-cn",
+        model: "MiniMax-VL-01",
+        buffer: Buffer.from("png-bytes"),
+        fileName: "image.png",
+        mime: "image/png",
+        prompt: "Describe the image.",
+        timeoutMs: 1000,
+      }),
+    ).resolves.toEqual({
+      text: "portal ok",
+      model: "MiniMax-VL-01",
+    });
+
+    const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
+    expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
+  });
+
  it("carries workspaceDir through image model and stream resolution", async () => {
    discoverModelsMock.mockReturnValue({
      find: vi.fn(() => ({
--- a/src/media-understanding/image.ts
+++ b/src/media-understanding/image.ts
@@ -21,11 +21,13 @@ import {
  coerceImageAssistantText,
  hasImageReasoningOnlyResponse,
 } from "../agents/tools/image-tool.helpers.js";
+import { isSecretRef } from "../config/types.secrets.js";
 import {
  buildCopilotIdeHeaders,
  COPILOT_INTEGRATION_ID,
  resolveCopilotApiToken,
 } from "../plugin-sdk/provider-auth.js";
+import { normalizeMediaProviderId } from "./provider-id.js";
 import type {
  ImageDescriptionRequest,
  ImageDescriptionResult,
@@ -315,6 +317,7 @@ function buildImageRequestHeaders(model: Model<Api>): Record<string, string> | u

 async function describeImagesWithMinimax(params: {
  apiKey: string;
+  provider: string;
  modelId: string;
  modelBaseUrl?: string;
  prompt: string;
@@ -329,6 +332,7 @@ async function describeImagesWithMinimax(params: {
        : params.prompt;
    const text = await minimaxUnderstandImage({
      apiKey: params.apiKey,
+      provider: params.provider,
      prompt,
      imageDataUrl: `data:${image.mime ?? "image/jpeg"};base64,${image.buffer.toString("base64")}`,
      modelBaseUrl: params.modelBaseUrl,
@@ -354,9 +358,53 @@ function resolveConfiguredProviderBaseUrl(
  if (typeof direct?.baseUrl === "string" && direct.baseUrl.trim()) {
    return direct.baseUrl.trim();
  }
+  const normalizedProvider = normalizeMediaProviderId(provider);
+  const normalized = cfg.models?.providers?.[normalizedProvider];
+  if (typeof normalized?.baseUrl === "string" && normalized.baseUrl.trim()) {
+    if (isMinimaxCnAlias(provider) && !isMinimaxCnBaseUrl(normalized.baseUrl)) {
+      return undefined;
+    }
+    return normalized.baseUrl.trim();
+  }
  return undefined;
 }

+function isMinimaxCnAlias(provider: string): boolean {
+  const normalized = provider.trim().toLowerCase();
+  return normalized === "minimax-cn" || normalized === "minimax-portal-cn";
+}
+
+function isMinimaxCnBaseUrl(baseUrl: string): boolean {
+  const trimmed = baseUrl.trim();
+  if (!trimmed) {
+    return false;
+  }
+  try {
+    const parsed = new URL(/^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`);
+    return parsed.hostname.toLowerCase() === "api.minimaxi.com";
+  } catch {
+    return false;
+  }
+}
+
+function hasConfiguredProviderApiKey(
+  cfg: ImageDescriptionRequest["cfg"],
+  provider: string,
+): boolean {
+  const apiKey = cfg.models?.providers?.[provider]?.apiKey;
+  return (typeof apiKey === "string" && apiKey.trim().length > 0) || isSecretRef(apiKey);
+}
+
+function resolveMinimaxVlmAuthProvider(
+  cfg: ImageDescriptionRequest["cfg"],
+  provider: string,
+): string {
+  if (!isMinimaxCnAlias(provider) || hasConfiguredProviderApiKey(cfg, provider)) {
+    return provider;
+  }
+  return normalizeMediaProviderId(provider);
+}
+
 async function resolveMinimaxVlmFallbackRuntime(params: {
  cfg: ImageDescriptionRequest["cfg"];
  agentDir: string;
@@ -365,8 +413,9 @@ async function resolveMinimaxVlmFallbackRuntime(params: {
  profile?: string;
  preferredProfile?: string;
 }): Promise<{ apiKey: string; modelBaseUrl?: string }> {
+  const authProvider = resolveMinimaxVlmAuthProvider(params.cfg, params.provider);
  const auth = await resolveApiKeyForProvider({
-    provider: params.provider,
+    provider: authProvider,
    cfg: params.cfg,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
@@ -374,7 +423,7 @@ async function resolveMinimaxVlmFallbackRuntime(params: {
    ...(params.workspaceDir ? { workspaceDir: params.workspaceDir } : {}),
  });
  return {
-    apiKey: requireApiKey(auth, params.provider),
+    apiKey: requireApiKey(auth, authProvider),
    modelBaseUrl: resolveConfiguredProviderBaseUrl(params.cfg, params.provider),
  };
 }
@@ -437,6 +486,7 @@ async function describeImagesWithModelInternal(
    const fallback = await resolveMinimaxVlmFallbackRuntime(params);
    return await describeImagesWithMinimax({
      apiKey: fallback.apiKey,
+      provider: params.provider,
      modelId: params.model,
      modelBaseUrl: fallback.modelBaseUrl,
      prompt,
@@ -448,6 +498,7 @@ async function describeImagesWithModelInternal(
  if (isMinimaxVlmModel(model.provider, model.id)) {
    return await describeImagesWithMinimax({
      apiKey,
+      provider: model.provider,
      modelId: model.id,
      modelBaseUrl: model.baseUrl,
      prompt,
--- a/src/media-understanding/media-understanding-misc.test.ts
+++ b/src/media-understanding/media-understanding-misc.test.ts
@@ -107,6 +107,28 @@ describe("media understanding attachments SSRF", () => {
    expect(fetchSpy).toHaveBeenCalledTimes(1);
  });

+  it("uses fetched content type instead of wildcard selection hints", async () => {
+    const url = "http://198.18.0.153/image";
+    const fetchSpy = vi.fn().mockResolvedValue(
+      new Response("image", {
+        headers: { "content-type": "image/png" },
+      }),
+    );
+    globalThis.fetch = withFetchPreconnect(fetchSpy);
+    const cache = new MediaAttachmentCache([{ index: 0, url, mime: "image/*" }], {
+      ssrfPolicy: { allowRfc2544BenchmarkRange: true },
+    });
+
+    const result = await cache.getBuffer({
+      attachmentIndex: 0,
+      maxBytes: 1024,
+      timeoutMs: 1000,
+    });
+
+    expect(result.mime).toBe("image/png");
+    expect(result.fileName).toBe("image.png");
+  });
+
  it("reads local attachments inside configured roots", async () => {
    await withLocalAttachmentCache("openclaw-media-cache-allowed-", async ({ cache }) => {
      const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 });
--- a/src/media-understanding/provider-id.ts
+++ b/src/media-understanding/provider-id.ts
@@ -5,5 +5,19 @@ export function normalizeMediaProviderId(id: string): string {
  if (normalized === "gemini") {
    return "google";
  }
+  if (normalized === "minimax-cn") {
+    return "minimax";
+  }
+  if (normalized === "minimax-portal-cn") {
+    return "minimax-portal";
+  }
  return normalized;
 }
+
+export function normalizeMediaExecutionProviderId(id: string): string {
+  const normalized = normalizeProviderId(id);
+  if (normalized === "minimax-cn" || normalized === "minimax-portal-cn") {
+    return normalized;
+  }
+  return normalizeMediaProviderId(normalized);
+}
--- a/src/media-understanding/provider-registry.ts
+++ b/src/media-understanding/provider-registry.ts
@@ -41,7 +41,7 @@ function hydrateModelBackedMediaProvider(
  };
 }

-export { normalizeMediaProviderId } from "./provider-id.js";
+export { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.js";

 export function buildMediaUnderstandingRegistry(
  overrides?: Record<string, MediaUnderstandingProvider>,
--- a/src/media-understanding/runner.entries.ts
+++ b/src/media-understanding/runner.entries.ts
@@ -34,6 +34,7 @@ import { MediaUnderstandingSkipError } from "./errors.js";
 import { fileExists } from "./fs.js";
 import { describeImageWithModel } from "./image-runtime.js";
 import { extractGeminiResponse } from "./output-extract.js";
+import { normalizeMediaExecutionProviderId } from "./provider-id.js";
 import { getMediaUnderstandingProvider, normalizeMediaProviderId } from "./provider-registry.js";
 import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js";
 import type {
@@ -566,6 +567,7 @@ export async function runProviderEntry(params: {
    throw new Error(`Provider entry missing provider for ${capability}`);
  }
  const providerId = normalizeMediaProviderId(providerIdRaw);
+  const requestProviderId = normalizeMediaExecutionProviderId(providerIdRaw);
  const { maxBytes, maxChars, timeoutMs, prompt } = resolveEntryRunOptions({
    capability,
    entry,
@@ -587,13 +589,13 @@ export async function runProviderEntry(params: {
      timeoutMs,
    });
    const requestOverrides = resolveMediaRequestOverrides(params.config);
-    const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
+    const provider = getMediaUnderstandingProvider(requestProviderId, params.providerRegistry);
    const imageInput = {
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      model: modelId,
-      provider: providerId,
+      provider: requestProviderId,
      prompt: requestOverrides.prompt ?? prompt,
      timeoutMs,
      profile: entry.profile,
@@ -608,7 +610,7 @@ export async function runProviderEntry(params: {
      kind: "image.description",
      attachmentIndex: params.attachmentIndex,
      text: trimOutput(result.text, maxChars),
-      provider: providerId,
+      provider: requestProviderId,
      model: result.model ?? modelId,
    };
  }
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -2,6 +2,7 @@ import { constants as fsConstants } from "node:fs";
 import fs from "node:fs/promises";
 import os from "node:os";
 import path from "node:path";
+import { isMinimaxVlmModel, isMinimaxVlmProvider } from "../agents/minimax-vlm.js";
 import { findNormalizedProviderValue } from "../agents/provider-id.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import {
@@ -26,7 +27,7 @@ import { MediaAttachmentCache, selectAttachments } from "./attachments.js";
 import { isMediaUnderstandingSkipError } from "./errors.js";
 import { fileExists } from "./fs.js";
 import { extractGeminiResponse } from "./output-extract.js";
-import { normalizeMediaProviderId } from "./provider-id.js";
+import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.js";
 import {
  buildMediaUnderstandingRegistry,
  getMediaUnderstandingProvider,
@@ -73,7 +74,7 @@ function resolveLiteralProviderApiKey(
  cfg: OpenClawConfig | undefined,
  providerId: string,
 ): string | null {
-  const value = cfg?.models?.providers?.[providerId]?.apiKey;
+  const value = findNormalizedProviderValue(cfg?.models?.providers, providerId)?.apiKey;
  return typeof value === "string" && value.trim().length > 0 ? value.trim() : null;
 }

@@ -98,11 +99,14 @@ function resolveConfiguredKeyProviderOrder(params: {
  fallbackProviders: readonly string[];
 }): string[] {
  const configuredProviders = Object.keys(params.cfg.models?.providers ?? {})
-    .map((providerId) => normalizeMediaProviderId(providerId))
+    .map((providerId) => normalizeMediaExecutionProviderId(providerId))
    .filter(Boolean)
    .filter((providerId, index, values) => values.indexOf(providerId) === index)
    .filter((providerId) =>
-      providerSupportsCapability(params.providerRegistry.get(providerId), params.capability),
+      providerSupportsCapability(
+        params.providerRegistry.get(normalizeMediaProviderId(providerId)),
+        params.capability,
+      ),
    );

  return [...new Set([...configuredProviders, ...params.fallbackProviders])];
@@ -112,6 +116,9 @@ function resolveConfiguredImageModelId(params: {
  cfg: OpenClawConfig;
  providerId: string;
 }): string | undefined {
+  if (isMinimaxVlmProvider(params.providerId)) {
+    return undefined;
+  }
  const configured = resolveConfiguredImageModel(params);
  const id = configured?.id?.trim();
  return id || undefined;
@@ -145,7 +152,7 @@ function resolveCatalogImageModelId(params: {
 }): string | undefined {
  const matches = params.catalog.filter(
    (entry) =>
-      normalizeMediaProviderId(entry.provider) === params.providerId &&
+      normalizeMediaProviderId(entry.provider) === normalizeMediaProviderId(params.providerId) &&
      params.modelSupportsVision(entry),
  );
  if (matches.length === 0) {
@@ -200,6 +207,12 @@ async function explicitImageModelVisionStatus(params: {
  providerId: string;
  model: string;
 }): Promise<"supported" | "unsupported" | "unknown"> {
+  if (
+    isMinimaxVlmProvider(params.providerId) &&
+    !isMinimaxVlmModel(params.providerId, params.model)
+  ) {
+    return "unsupported";
+  }
  const configured = resolveConfiguredImageModel(params);
  if (configured?.id?.trim() === params.model && configured.input?.includes("image")) {
    return "supported";
@@ -231,6 +244,9 @@ async function resolveAutoImageModelId(params: {
      return explicit;
    }
  }
+  if (isMinimaxVlmProvider(params.providerId)) {
+    return "MiniMax-VL-01";
+  }
  const configuredModel = resolveConfiguredImageModelId(params);
  if (configuredModel) {
    return configuredModel;
@@ -736,7 +752,7 @@ async function resolveActiveModelEntry(params: {
  if (!activeProviderRaw) {
    return null;
  }
-  const providerId = normalizeMediaProviderId(activeProviderRaw);
+  const providerId = normalizeMediaExecutionProviderId(activeProviderRaw);
  if (!providerId) {
    return null;
  }
@@ -940,6 +956,7 @@ export async function runCapability(params: {
  if (
    capability === "image" &&
    activeProvider &&
+    !isMinimaxVlmProvider(activeProvider) &&
    !hasExplicitImageUnderstandingConfig({ cfg, config })
  ) {
    const { findModelInCatalog, loadModelCatalog, modelSupportsVision } =
--- a/src/media-understanding/runner.vision-skip.test.ts
+++ b/src/media-understanding/runner.vision-skip.test.ts
@@ -12,6 +12,7 @@ import { createEmptyPluginRegistry } from "../plugins/registry.js";
 import { setActivePluginRegistry } from "../plugins/runtime.js";
 import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js";
 import { withMediaFixture } from "./runner.test-utils.js";
+import type { MediaUnderstandingProvider } from "./types.js";

 type TestCatalogEntry = {
  id: string;
@@ -273,7 +274,7 @@ describe("runCapability image skip", () => {
          imageModel: { primary: "openrouter/google/gemini-2.5-flash" },
        },
      },
-    } as OpenClawConfig;
+    } as unknown as OpenClawConfig;

    await expect(
      resolveAutoImageModel({
@@ -286,13 +287,13 @@ describe("runCapability image skip", () => {
    });
  });

-  it("falls back from an active text model to the provider image default", async () => {
+  it("falls back from a MiniMax chat model to the provider image default", async () => {
    catalog = [
      {
        id: "MiniMax-M2.7",
        name: "MiniMax M2.7",
        provider: "minimax-portal",
-        input: ["text"] as const,
+        input: ["text", "image"] as const,
      },
      {
        id: "MiniMax-VL-01",
@@ -302,7 +303,20 @@ describe("runCapability image skip", () => {
      },
    ];
    vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key");
-    const cfg = {} as OpenClawConfig;
+    const cfg = {
+      models: {
+        providers: {
+          "minimax-portal": {
+            models: [
+              {
+                id: "MiniMax-M2.7",
+                input: ["text", "image"],
+              },
+            ],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
    const pluginRegistry = createEmptyPluginRegistry();
    pluginRegistry.mediaUnderstandingProviders.push({
      pluginId: "minimax",
@@ -333,6 +347,300 @@ describe("runCapability image skip", () => {
    }
  });

+  it("does not native-skip MiniMax chat models that claim image input", async () => {
+    catalog = [
+      {
+        id: "MiniMax-M2.7",
+        name: "MiniMax M2.7",
+        provider: "minimax-portal",
+        input: ["text", "image"] as const, // the chat model's catalog entry claims image input
+      },
+    ];
+    vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key");
+    const cfg = {
+      models: {
+        providers: {
+          "minimax-portal": {
+            models: [
+              {
+                id: "MiniMax-M2.7",
+                input: ["text", "image"], // the configured model metadata makes the same claim
+              },
+            ],
+          },
+        },
+      },
+    } as unknown as OpenClawConfig;
+    const pluginRegistry = createEmptyPluginRegistry();
+    pluginRegistry.mediaUnderstandingProviders.push({
+      pluginId: "minimax",
+      pluginName: "MiniMax Provider",
+      source: "test",
+      provider: {
+        id: "minimax-portal",
+        capabilities: ["image"],
+        defaultModels: { image: "MiniMax-VL-01" }, // provider-level image default
+        describeImage: async (req) => ({ text: "vlm ok", model: req.model }), // echoes the requested model
+      },
+    });
+    setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
+
+    try {
+      await withMediaFixture(
+        {
+          filePrefix: "openclaw-minimax-vlm-no-native-skip",
+          extension: "png",
+          mediaType: "image/png",
+          fileContents: Buffer.from("image"),
+        },
+        async ({ ctx, media, cache }) => {
+          const result = await runCapability({
+            capability: "image",
+            cfg,
+            ctx,
+            attachments: cache,
+            media,
+            agentDir: "/tmp",
+            providerRegistry: buildProviderRegistry(undefined, cfg),
+            activeModel: { provider: "minimax-portal", model: "MiniMax-M2.7" }, // active model is the chat model
+          });
+
+          expect(result.decision.outcome).toBe("success"); // the capability ran instead of being skipped
+          expect(requireCapabilityOutput(result, 0)).toEqual({
+            kind: "image.description",
+            attachmentIndex: 0,
+            provider: "minimax-portal",
+            model: "MiniMax-VL-01", // the description came from the VLM, not the active MiniMax-M2.7
+            text: "vlm ok",
+          });
+        },
+      );
+    } finally {
+      setActivePluginRegistry(createEmptyPluginRegistry()); // swap an empty plugin registry back in
+      vi.unstubAllEnvs(); // undo the MINIMAX_API_KEY stub
+    }
+  });
+
+  it("preserves MiniMax CN aliases from configured provider routing", async () => {
+    const seenProviders: string[] = []; // provider ids received by the describeImage hook below
+    const cfg = {
+      models: {
+        providers: {
+          "minimax-cn": {
+            apiKey: "test-minimax-key",
+            baseUrl: "https://api.minimaxi.com/anthropic",
+            models: [], // provider configured under the "minimax-cn" key with no model entries
+          },
+        },
+      },
+    } as OpenClawConfig;
+    const pluginRegistry = createEmptyPluginRegistry();
+    pluginRegistry.mediaUnderstandingProviders.push({
+      pluginId: "minimax",
+      pluginName: "MiniMax Provider",
+      source: "test",
+      provider: {
+        id: "minimax", // the plugin provider is registered under the base "minimax" id
+        capabilities: ["image"],
+        defaultModels: { image: "MiniMax-VL-01" },
+        describeImage: async (req) => {
+          seenProviders.push(req.provider);
+          return { text: "cn vlm ok", model: req.model };
+        },
+      },
+    });
+    setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
+
+    try {
+      await withMediaFixture(
+        {
+          filePrefix: "openclaw-minimax-cn-provider",
+          extension: "png",
+          mediaType: "image/png",
+          fileContents: Buffer.from("image"),
+        },
+        async ({ ctx, media, cache }) => {
+          const result = await runCapability({
+            capability: "image",
+            cfg,
+            ctx,
+            attachments: cache,
+            media,
+            agentDir: "/tmp",
+            providerRegistry: buildProviderRegistry(undefined, cfg),
+          });
+
+          expect(result.decision.outcome).toBe("success");
+          expect(seenProviders).toEqual(["minimax-cn"]); // the hook saw the configured alias, not "minimax"
+          expect(requireCapabilityOutput(result, 0)).toEqual({
+            kind: "image.description",
+            attachmentIndex: 0,
+            provider: "minimax-cn", // the output is attributed to the alias as well
+            model: "MiniMax-VL-01",
+            text: "cn vlm ok",
+          });
+        },
+      );
+    } finally {
+      setActivePluginRegistry(createEmptyPluginRegistry()); // swap an empty plugin registry back in
+      vi.unstubAllEnvs(); // NOTE(review): this test stubs no env vars; presumably kept for symmetry
+    }
+  });
+
+  it("keeps MiniMax auto routing on VLM when registry lacks a default model", async () => {
+    let seenModel: string | undefined; // model id received by the describeImage hook below
+    await withMediaFixture(
+      {
+        filePrefix: "openclaw-minimax-vlm-default",
+        extension: "png",
+        mediaType: "image/png",
+        fileContents: Buffer.from("image"),
+      },
+      async ({ ctx, media, cache }) => {
+        const cfg = {
+          models: {
+            providers: {
+              minimax: {
+                apiKey: "test-minimax-key",
+                baseUrl: "https://api.minimax.io/anthropic",
+                models: [
+                  {
+                    id: "MiniMax-M2.5",
+                    name: "MiniMax M2.5",
+                    reasoning: false,
+                    input: ["text", "image"], // the configured chat model claims image input
+                    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+                    contextWindow: 128_000,
+                    maxTokens: 8_192,
+                  },
+                ],
+              },
+            },
+          },
+        } as OpenClawConfig;
+
+        const result = await runCapability({
+          capability: "image",
+          cfg,
+          ctx,
+          attachments: cache,
+          media,
+          agentDir: "/tmp",
+          providerRegistry: new Map([
+            [
+              "minimax",
+              {
+                id: "minimax",
+                capabilities: ["image"], // this registry entry defines no defaultModels
+                describeImage: async (req) => {
+                  seenModel = req.model;
+                  return { text: "vlm ok", model: req.model };
+                },
+              },
+            ],
+          ]),
+        });
+
+        expect(result.decision.outcome).toBe("success");
+        expect(seenModel).toBe("MiniMax-VL-01"); // the VLM is used even without a registry default model
+        expect(requireCapabilityOutput(result, 0)).toMatchObject({
+          provider: "minimax",
+          model: "MiniMax-VL-01",
+          text: "vlm ok",
+        });
+      },
+    );
+  });
+
+  it("keeps non-MiniMax media aliases canonical for image execution", async () => {
+    const seenProviders: string[] = []; // provider ids received by the describeImage hook below
+    const cfg = {
+      tools: {
+        media: {
+          image: {
+            models: [{ provider: "gemini", model: "gemini-3-flash-preview" }], // configured as "gemini"
+          },
+        },
+      },
+    } as OpenClawConfig;
+    const providerRegistry = new Map<string, MediaUnderstandingProvider>([
+      [
+        "google",
+        {
+          id: "google", // the registry only contains a "google" entry
+          capabilities: ["image" as const],
+          describeImage: async (req) => {
+            seenProviders.push(req.provider);
+            return { text: "google ok", model: req.model };
+          },
+        },
+      ],
+    ]);
+
+    await withMediaFixture(
+      {
+        filePrefix: "openclaw-gemini-media-alias",
+        extension: "png",
+        mediaType: "image/png",
+        fileContents: Buffer.from("image"),
+      },
+      async ({ ctx, media, cache }) => {
+        const result = await runCapability({
+          capability: "image",
+          cfg,
+          ctx,
+          attachments: cache,
+          media,
+          agentDir: "/tmp",
+          providerRegistry,
+        });
+
+        expect(result.decision.outcome).toBe("success");
+        expect(seenProviders).toEqual(["google"]); // the hook receives "google", not the configured "gemini"
+        expect(requireCapabilityOutput(result, 0)).toEqual({
+          kind: "image.description",
+          attachmentIndex: 0,
+          provider: "google",
+          model: "gemini-3-flash-preview",
+          text: "google ok",
+        });
+      },
+    );
+  });
+
+  it("canonicalizes non-MiniMax active media aliases for auto image resolution", async () => {
+    vi.stubEnv("GEMINI_API_KEY", "test-gemini-key");
+    const cfg = {} as OpenClawConfig; // empty config: nothing image-related is configured explicitly
+    const pluginRegistry = createEmptyPluginRegistry();
+    pluginRegistry.mediaUnderstandingProviders.push({
+      pluginId: "google",
+      pluginName: "Google Provider",
+      source: "test",
+      provider: {
+        id: "google", // the plugin provider is registered as "google"
+        capabilities: ["image"],
+        defaultModels: { image: "gemini-3-flash-preview" },
+        describeImage: async () => ({ text: "ok" }),
+      },
+    });
+    setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
+
+    try {
+      await expect(
+        resolveAutoImageModel({
+          cfg,
+          activeModel: { provider: "gemini", model: "gemini-3-flash-preview" }, // active provider given as "gemini"
+        }),
+      ).resolves.toEqual({
+        provider: "google", // the resolved provider is "google"
+        model: "gemini-3-flash-preview",
+      });
+    } finally {
+      setActivePluginRegistry(createEmptyPluginRegistry()); // swap an empty plugin registry back in
+      vi.unstubAllEnvs(); // undo the GEMINI_API_KEY stub
+    }
+  });
+
  it("uses active OpenRouter image models for auto image resolution", async () => {
    vi.stubEnv("OPENROUTER_API_KEY", "test-openrouter-key");
    const cfg = {} as OpenClawConfig;
--- a/src/media-understanding/runtime.test.ts
+++ b/src/media-understanding/runtime.test.ts
@@ -67,6 +67,10 @@ describe("media-understanding runtime", () => {
  afterEach(() => {
    mocks.buildProviderRegistry.mockReset();
    mocks.createMediaAttachmentCache.mockReset();
+    mocks.createMediaAttachmentCache.mockReturnValue({
+      cleanup: mocks.cleanup,
+      getBuffer: mocks.getBuffer,
+    });
    mocks.normalizeMediaAttachments.mockReset();
    mocks.normalizeMediaProviderId.mockReset();
    mocks.buildMediaUnderstandingRegistry.mockReset();
@@ -186,6 +190,76 @@ describe("media-understanding runtime", () => {
    expect(mocks.cleanup).toHaveBeenCalledTimes(1);
  });

+  it("classifies extensionless remote image URLs before capability filtering", async () => {
+    const output: MediaUnderstandingOutput = {
+      kind: "image.description",
+      attachmentIndex: 0,
+      provider: "vision-plugin",
+      model: "vision-v1",
+      text: "image ok",
+    };
+    mocks.normalizeMediaAttachments.mockReturnValue([
+      { index: 0, url: "https://httpbin.org/image/png", mime: "image/*" }, // the URL path has no file extension
+    ]);
+    mocks.runCapability.mockResolvedValue({
+      outputs: [output],
+    });
+
+    await expect(
+      describeImageFile({
+        filePath: "https://httpbin.org/image/png", // the URL is passed through the filePath parameter
+        cfg: {} as OpenClawConfig,
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toEqual({
+      text: "image ok",
+      provider: "vision-plugin",
+      model: "vision-v1",
+      output,
+    });
+
+    expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
+      MediaUrl: "https://httpbin.org/image/png", // forwarded as MediaUrl (no MediaPath key is expected)
+      MediaType: "image/*", // wildcard image type expected for the extensionless URL
+    });
+    expect(requireRunCapabilityRequest()).toMatchObject({
+      ctx: {
+        MediaUrl: "https://httpbin.org/image/png",
+        MediaType: "image/*",
+      },
+    });
+  });
+
+  it("does not force typed remote URLs into the requested capability", async () => {
+    const media = [{ index: 0, url: "https://example.com/clip.mp4", mime: "video/mp4" }];
+    mocks.normalizeMediaAttachments.mockReturnValue(media);
+    mocks.runCapability.mockResolvedValue({
+      outputs: [],
+      decision: { capability: "image", outcome: "skipped", attachments: [] }, // the mocked run reports a skip
+    });
+
+    await expect(
+      describeImageFile({
+        filePath: "https://example.com/clip.mp4",
+        cfg: {} as OpenClawConfig,
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toMatchObject({
+      text: undefined, // no description comes back for the skipped run
+      output: undefined,
+    });
+
+    expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
+      MediaUrl: "https://example.com/clip.mp4",
+      MediaType: "video/mp4", // expected to stay video/mp4 rather than become image/*
+    });
+    expect(requireRunCapabilityRequest()).toMatchObject({
+      capability: "image", // the request itself still targets the image capability
+      ctx: { MediaUrl: "https://example.com/clip.mp4", MediaType: "video/mp4" },
+      media,
+    });
+  });
+
  it("passes workspaceDir through file media understanding requests", async () => {
    const output: MediaUnderstandingOutput = {
      kind: "image.description",
@@ -395,6 +469,7 @@ describe("media-understanding runtime", () => {
    await describeImageFileWithModel({
      filePath: "https://example.com/photo.png",
      mediaUrl: "https://example.com/photo.png",
+      mime: "image/*",
      provider: "zai",
      model: "glm-4.6v",
      prompt: "Describe it",
@@ -412,6 +487,58 @@ describe("media-understanding runtime", () => {
    expect(mocks.cleanup).toHaveBeenCalledTimes(1);
  });

+  it("fetches remote explicit image descriptions through the media attachment cache", async () => {
+    mocks.normalizeMediaAttachments.mockReturnValue([
+      { index: 0, url: "https://httpbin.org/image/png", mime: "image/png" },
+    ]);
+    mocks.buildProviderRegistry.mockReturnValue(
+      new Map([["zai", { id: "zai", capabilities: ["image"] }]]), // the registry entry has no describeImage hook
+    );
+    mocks.getBuffer.mockResolvedValue({
+      buffer: Buffer.from("remote-png"),
+      fileName: "png",
+      mime: "image/png",
+      size: 10,
+    });
+
+    await expect(
+      describeImageFileWithModel({
+        filePath: "https://httpbin.org/image/png", // URL supplied via filePath; no mediaUrl is given
+        provider: "zai",
+        model: "glm-4.6v",
+        prompt: "Describe it",
+        cfg: {} as OpenClawConfig,
+        agentDir: "/tmp/agent",
+        timeoutMs: 45_000,
+      }),
+    ).resolves.toEqual({ text: "generic image ok", model: "vision" });
+
+    expect(mocks.readLocalFileSafely).not.toHaveBeenCalled(); // the URL must not be read as a local file
+    expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
+      MediaUrl: "https://httpbin.org/image/png",
+      MediaType: "image/*", // the context built for the extensionless URL uses the image wildcard
+    });
+    expect(mocks.createMediaAttachmentCache).toHaveBeenCalledWith(
+      [{ index: 0, url: "https://httpbin.org/image/png", mime: "image/png" }],
+      { ssrfPolicy: undefined }, // the empty cfg carries no SSRF policy
+    );
+    expect(mocks.getBuffer).toHaveBeenCalledWith({
+      attachmentIndex: 0,
+      maxBytes: 10 * 1024 * 1024, // NOTE(review): presumably mirrors DEFAULT_MAX_BYTES.image; confirm
+      timeoutMs: 45_000, // the caller's timeout reaches the cache fetch
+    });
+    expect(mocks.describeImageWithModel).toHaveBeenCalledWith(
+      expect.objectContaining({
+        buffer: Buffer.from("remote-png"),
+        fileName: "png",
+        mime: "image/png", // MIME as returned by the mocked getBuffer
+        provider: "zai",
+        model: "glm-4.6v",
+      }),
+    );
+    expect(mocks.cleanup).toHaveBeenCalledOnce(); // the cache is cleaned up after the fetch
+  });
+
  it("routes direct image description through a provider-specific image hook", async () => {
    const describeImage = vi.fn(async () => ({
      text: "image ok",
--- a/src/media-understanding/runtime.ts
+++ b/src/media-understanding/runtime.ts
@@ -1,5 +1,7 @@
 import path from "node:path";
+import type { OpenClawConfig } from "../config/types.js";
 import { readLocalFileSafely } from "../infra/fs-safe.js";
+import { kindFromMime, mimeTypeFromFilePath } from "../media/mime.js";
 import { DEFAULT_MAX_BYTES } from "./defaults.constants.js";
 import { describeImageWithModel } from "./image-runtime.js";
 import {
@@ -48,13 +50,61 @@ function resolveDecisionFailureReason(
  return normalizeDecisionReason(findDecisionReason(decision, "failed"));
 }

-function buildFileContext(params: { filePath: string; mediaUrl?: string; mime?: string }) {
+function buildFileContext(params: {
+  filePath: string;
+  mediaUrl?: string;
+  mime?: string;
+  capability?: MediaUnderstandingCapability; // requested capability; only used to pick a wildcard MediaType
+}) {
+  const remoteRef =
+    params.mediaUrl ||
+    (isRemoteMediaReference(params.filePath) ? params.filePath.trim() : undefined); // `||`: an empty mediaUrl must not hide an http(s) filePath
+  const extensionMime = remoteRef ? mimeTypeFromFilePath(remoteRef) : undefined; // only remote refs are inspected
+  const extensionKind = kindFromMime(extensionMime); // when this equals the capability, the wildcard replaces extensionMime
+  const mediaType =
+    params.mime ??
+    (remoteRef && params.capability && extensionKind === params.capability
+      ? `${params.capability}/*`
+      : extensionMime) ??
+    (remoteRef && params.capability ? `${params.capability}/*` : undefined); // order: explicit mime, extension result, capability wildcard
+  if (remoteRef) {
+    return {
+      MediaUrl: remoteRef, // remote references are emitted as MediaUrl
+      MediaType: mediaType,
+    };
+  }
  return {
-    ...(params.mediaUrl ? { MediaUrl: params.mediaUrl } : { MediaPath: params.filePath }),
-    MediaType: params.mime,
+    MediaPath: params.filePath, // everything else is emitted as a local MediaPath
+    MediaType: mediaType,
   };
 }

+function isRemoteMediaReference(value: string): boolean {
+  return /^https?:\/\//i.test(value.trim()); // trimmed value starts with http:// or https:// (any case)
+}
+
+// Returns the trimmed MIME type, or undefined when it is absent, blank, or a wildcard ("image/*").
+function concreteMime(mime: string | undefined): string | undefined {
+  const trimmed = mime?.trim() ?? "";
+  const isBlankOrWildcard = trimmed === "" || trimmed.endsWith("/*");
+  // A wildcard names no concrete format, so callers can fall back to another MIME source.
+  return isBlankOrWildcard ? undefined : trimmed;
+}
+
+function resolveFileLocalRoots(filePath: string): string[] | undefined {
+  return isRemoteMediaReference(filePath) ? undefined : [path.dirname(filePath)]; // no local root for http(s) refs
+}
+
+function basenameFromMediaReference(value: string): string {
+  let urlName: string | undefined; // stays undefined for local paths and unparsable URLs
+  try {
+    urlName = isRemoteMediaReference(value) ? path.basename(new URL(value).pathname) : undefined;
+  } catch {
+    // Best effort: an unparsable URL falls through to the plain path basename below.
+  }
+  return urlName === undefined ? path.basename(value) : urlName || "image"; // "image" when the URL path has no last segment
+}
+
 function hasStructuredImageInput(input: ExtractStructuredWithModelParams["input"]): boolean {
  return input.some((entry) => entry.type === "image");
 }
@@ -93,7 +143,7 @@ export async function runMediaUnderstandingFile(
          },
        }
      : params.cfg;
-  const ctx = buildFileContext(params);
+  const ctx = buildFileContext({ ...params, capability: params.capability });
  const attachments = normalizeMediaAttachments(ctx);
  if (attachments.length === 0) {
    return {
@@ -114,7 +164,7 @@ export async function runMediaUnderstandingFile(

  const providerRegistry = buildProviderRegistry(undefined, cfg);
  const cache = createMediaAttachmentCache(attachments, {
-    localPathRoots: [path.dirname(params.filePath)],
+    localPathRoots: params.mediaUrl ? undefined : resolveFileLocalRoots(params.filePath),
    ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
  });

@@ -166,33 +216,18 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo
  const timeoutMs = params.timeoutMs ?? 30_000;
  const providerRegistry = buildProviderRegistry(undefined, params.cfg);
  const provider = providerRegistry.get(normalizeMediaProviderId(params.provider));
-  let buffer: Buffer;
-  let fileName = path.basename(params.filePath);
-  let mime = params.mime;
-  if (params.mediaUrl) {
-    const cache = createMediaAttachmentCache(normalizeMediaAttachments(buildFileContext(params)), {
-      ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy,
-    });
-    try {
-      const media = await cache.getBuffer({
-        attachmentIndex: 0,
-        maxBytes: DEFAULT_MAX_BYTES.image,
-        timeoutMs,
-      });
-      buffer = media.buffer;
-      fileName = media.fileName;
-      mime = media.mime;
-    } finally {
-      await cache.cleanup();
-    }
-  } else {
-    buffer = (await readLocalFileSafely({ filePath: params.filePath })).buffer;
-  }
+  const image = await readImageDescriptionInput({
+    filePath: params.filePath,
+    mediaUrl: params.mediaUrl,
+    mime: params.mime,
+    cfg: params.cfg,
+    timeoutMs,
+  });
  const describeImage = provider?.describeImage ?? describeImageWithModel;
  return await describeImage({
-    buffer,
-    fileName,
-    mime,
+    buffer: image.buffer,
+    fileName: image.fileName,
+    mime: image.mime,
    provider: params.provider,
    model: params.model,
    prompt: params.prompt,
@@ -204,6 +239,45 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo
  });
 }

+/** Reads image bytes locally, or through the attachment cache for a mediaUrl / http(s) filePath. */
+async function readImageDescriptionInput(params: {
+  filePath: string;
+  mediaUrl?: string;
+  mime?: string;
+  cfg: OpenClawConfig;
+  timeoutMs: number;
+}): Promise<{ buffer: Buffer; fileName: string; mime?: string }> {
+  // `||` rather than `??`: an empty mediaUrl must not mask an http(s) filePath.
+  const remoteRef =
+    params.mediaUrl || (isRemoteMediaReference(params.filePath) ? params.filePath.trim() : "");
+  if (!remoteRef) {
+    return {
+      buffer: (await readLocalFileSafely({ filePath: params.filePath })).buffer,
+      fileName: basenameFromMediaReference(params.filePath),
+      mime: params.mime,
+    };
+  }
+  // Hand the resolved reference to buildFileContext so both agree on what is remote.
+  const ctx = buildFileContext({ ...params, mediaUrl: remoteRef, capability: "image" });
+  const cache = createMediaAttachmentCache(normalizeMediaAttachments(ctx), {
+    ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy,
+  });
+  try {
+    const media = await cache.getBuffer({
+      attachmentIndex: 0,
+      maxBytes: DEFAULT_MAX_BYTES.image,
+      timeoutMs: params.timeoutMs,
+    });
+    return {
+      buffer: media.buffer,
+      fileName: media.fileName || basenameFromMediaReference(remoteRef),
+      mime: concreteMime(params.mime) ?? media.mime, // a wildcard hint yields to the fetched MIME
+    };
+  } finally {
+    await cache.cleanup();
+  }
+}
+
 export async function extractStructuredWithModel(params: ExtractStructuredWithModelParams) {
  const timeoutMs = params.timeoutMs ?? 30_000;
  if (!hasStructuredImageInput(params.input)) {