From 5d1f7bf058949da6be6a614a66a4e1d1fa8f245e Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 17 May 2026 08:45:50 +0100 Subject: [PATCH] fix: route image URL describes through MiniMax VLM Summary: - Preserve HTTP image describe inputs as remote media. - Route MiniMax CN image understanding through MiniMax-VL-01. - Cover CLI, media runtime, tools, Telegram stickers, docs, and changelog. Verification: - codex-review clean - pnpm check:changed via Blacksmith Testbox tbx_01krtdekwak0mygxbw5z7cfb6z - PR CI green on 516281448e6d5499ce17928d820f1c4d24a0b612 --- CHANGELOG.md | 1 + docs/cli/infer.md | 28 +- docs/nodes/media-understanding.md | 4 +- .../src/sticker-cache.describe.test.ts | 125 +++++++ extensions/telegram/src/sticker-cache.ts | 26 +- .../minimax-vlm.normalizes-api-key.test.ts | 57 ++++ src/agents/minimax-vlm.ts | 27 +- src/agents/tools/image-tool.helpers.ts | 4 + src/agents/tools/image-tool.test.ts | 125 +++++++ src/agents/tools/image-tool.ts | 67 +++- .../tools/pdf-tool.model-config.test.ts | 32 ++ src/agents/tools/pdf-tool.model-config.ts | 4 + src/cli/capability-cli.test.ts | 42 +++ src/cli/capability-cli.ts | 9 +- src/media-understanding/attachments.cache.ts | 12 +- src/media-understanding/defaults.test.ts | 54 +++ src/media-understanding/defaults.ts | 49 ++- src/media-understanding/image.test.ts | 129 +++++++ src/media-understanding/image.ts | 55 ++- .../media-understanding-misc.test.ts | 22 ++ src/media-understanding/provider-id.ts | 14 + src/media-understanding/provider-registry.ts | 2 +- src/media-understanding/runner.entries.ts | 8 +- src/media-understanding/runner.ts | 29 +- .../runner.vision-skip.test.ts | 316 +++++++++++++++++- src/media-understanding/runtime.test.ts | 127 +++++++ src/media-understanding/runtime.ts | 134 ++++++-- 27 files changed, 1425 insertions(+), 77 deletions(-) create mode 100644 extensions/telegram/src/sticker-cache.describe.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 80aaa56905c..f75edb650b9 100644 
--- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -85,6 +85,7 @@ Docs: https://docs.openclaw.ai - Agents/followups: route queued followup turns through CLI runtime backends instead of embedded harness lookup, preventing `claude-cli`/`google-gemini-cli` followups from failing before delivery. Fixes #82847. (#82857) Thanks @hclsys. - CLI/sessions: let `openclaw sessions cleanup --fix-missing` prune malformed rows with unresolvable transcript metadata instead of throwing. Fixes #80970. (#82745) Thanks @IWhatsskill. - Gateway/usage: refresh large session usage summaries in the background and reuse durable transcript metadata so `sessions.usage` no longer blocks Gateway requests on full transcript rescans. Fixes #82773. (#82778) Thanks @hclsys. +- CLI/MiniMax media: let `openclaw infer image describe --file` accept HTTP(S) image URLs without treating them as local paths, and keep automatic MiniMax image understanding routed through `MiniMax-VL-01` even when legacy MiniMax M2.x chat metadata claims image input. Fixes #82837. Thanks @mGaolin. - TUI: restore the submitted draft when chat is busy instead of clearing it or queueing another run. Fixes #45326. (#82774) Thanks @hyspacex. - Cron/memory: treat claimed `before_agent_reply` cron hooks as execution progress, so long memory dreaming promotion jobs are not aborted by the isolated-run pre-execution watchdog. Fixes #82811. - Discord: recover transcript-backed full answers when progress-mode final payloads are ellipsis-truncated, so long replies fall back to normal chunked delivery instead of replacing the preview with a shortened message. Fixes #82807. Thanks @blueberry6401. diff --git a/docs/cli/infer.md b/docs/cli/infer.md index b28b75ec438..98f8b5b905e 100644 --- a/docs/cli/infer.md +++ b/docs/cli/infer.md @@ -107,19 +107,19 @@ runtime before the provider request is made. This table maps common inference tasks to the corresponding infer command. 
-| Task | Command | Notes | -| ---------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- | -| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default | -| Run a model prompt on images | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs | -| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file | -| Describe an image file | `openclaw infer image describe --file ./image.png --prompt "..." --json` | `--model` must be an image-capable `` | -| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `` | -| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented | -| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` | -| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `` | -| Search the web | `openclaw infer web search --query "..." --json` | | -| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | | -| Create embeddings | `openclaw infer embedding create --text "..." --json` | | +| Task | Command | Notes | +| ----------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- | +| Run a text/model prompt | `openclaw infer model run --prompt "..." 
--json` | Uses the normal local path by default | +| Run a model prompt on images | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs | +| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file | +| Describe an image file or URL | `openclaw infer image describe --file ./image.png --prompt "..." --json` | `--model` must be an image-capable `<provider/model>` | +| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` | +| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented | +| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` | +| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` | +| Search the web | `openclaw infer web search --query "..." --json` | | +| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | | +| Create embeddings | `openclaw infer embedding create --text "..." --json` | | ## Behavior @@ -128,6 +128,7 @@ This table maps common inference tasks to the corresponding infer command. - Use `--provider` or `--model provider/model` when a specific backend is required. - Use `model run --thinking <level>` to pass a one-shot thinking/reasoning level (`off`, `minimal`, `low`, `medium`, `high`, `adaptive`, `xhigh`, or `max`) while keeping the run raw. - For `image describe`, `audio transcribe`, and `video describe`, `--model` must use the form `<provider/model>`. +- For `image describe`, `--file` accepts local paths and HTTP(S) image URLs. Remote URLs use the normal media-fetch SSRF policy. - For `image describe`, an explicit `--model` runs that provider/model directly. The model must be image-capable in the model catalog or provider config. 
`codex/<model>` runs a bounded Codex app-server image-understanding turn; `openai-codex/<model>` uses the OpenAI Codex OAuth provider path. - Stateless execution commands default to local. - Gateway-managed state commands default to gateway. @@ -192,6 +193,7 @@ openclaw infer image generate --prompt "slow image backend" --timeout-ms 180000 openclaw infer image edit --file ./logo.png --model openai/gpt-image-1.5 --output-format png --background transparent --prompt "keep the logo, remove the background" --json openclaw infer image edit --file ./poster.png --prompt "make this a vertical story ad" --size 2160x3840 --aspect-ratio 9:16 --resolution 4K --json openclaw infer image describe --file ./photo.jpg --json +openclaw infer image describe --file https://example.com/photo.png --json openclaw infer image describe --file ./receipt.jpg --prompt "Extract the merchant, date, and total" --json openclaw infer image describe-many --file ./before.png --file ./after.png --prompt "Compare the screenshots and list visible UI changes" --json openclaw infer image describe --file ./ui-screenshot.png --model openai/gpt-4.1-mini --json diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 8c48588e494..40714dae2f4 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -260,8 +260,8 @@ For CLI entries, **set `capabilities` explicitly** to avoid surprising matches. **MiniMax note** -- `minimax` and `minimax-portal` image understanding comes from the plugin-owned `MiniMax-VL-01` media provider. -- The bundled MiniMax text catalog still starts text-only; explicit `models.providers.minimax` entries materialize image-capable M2.7 chat refs. +- `minimax`, `minimax-cn`, `minimax-portal`, and `minimax-portal-cn` image understanding comes from the plugin-owned `MiniMax-VL-01` media provider. +- Automatic image routing keeps using `MiniMax-VL-01` even if legacy MiniMax M2.x chat metadata claims image input. 
diff --git a/extensions/telegram/src/sticker-cache.describe.test.ts b/extensions/telegram/src/sticker-cache.describe.test.ts new file mode 100644 index 00000000000..803750a2e71 --- /dev/null +++ b/extensions/telegram/src/sticker-cache.describe.test.ts @@ -0,0 +1,125 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { describeStickerImage } from "./sticker-cache.js"; + +const mocks = vi.hoisted(() => { + const describeImageFileWithModel = vi.fn(async () => ({ + text: "vlm ok", + model: "MiniMax-VL-01", + })); + return { + describeImageFileWithModel, + findModelInCatalog: vi.fn((_catalog, provider: string, model: string) => ({ + provider, + id: model, + input: ["text", "image"], + })), + loadModelCatalog: vi.fn(async () => [ + { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] }, + { provider: "minimax", id: "MiniMax-M2.7", input: ["text", "image"] }, + ]), + modelSupportsVision: vi.fn((entry: { input?: string[] } | undefined) => + Boolean(entry?.input?.includes("image")), + ), + resolveApiKeyForProvider: vi.fn(async () => ({ apiKey: "minimax-test" })), + resolveAutoImageModel: vi.fn(async () => ({ + provider: "minimax-cn", + model: "MiniMax-VL-01", + })), + resolveAutoMediaKeyProviders: vi.fn(() => ["minimax-cn", "minimax"]), + resolveDefaultMediaModel: vi.fn(() => "MiniMax-VL-01"), + resolveDefaultModelForAgent: vi.fn(() => ({ + provider: "minimax-cn", + model: "MiniMax-M2.7", + })), + }; +}); + +vi.mock("openclaw/plugin-sdk/agent-runtime", () => ({ + findModelInCatalog: mocks.findModelInCatalog, + loadModelCatalog: mocks.loadModelCatalog, + modelSupportsVision: mocks.modelSupportsVision, + resolveApiKeyForProvider: mocks.resolveApiKeyForProvider, + resolveDefaultModelForAgent: mocks.resolveDefaultModelForAgent, +})); + +vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ + resolveAutoImageModel: mocks.resolveAutoImageModel, + resolveAutoMediaKeyProviders: mocks.resolveAutoMediaKeyProviders, + resolveDefaultMediaModel: 
mocks.resolveDefaultMediaModel, +})); + +vi.mock("./runtime.js", () => ({ + getTelegramRuntime: () => ({ + mediaUnderstanding: { + describeImageFileWithModel: mocks.describeImageFileWithModel, + }, + }), +})); + +describe("describeStickerImage", () => { + beforeEach(() => { + mocks.describeImageFileWithModel.mockClear(); + mocks.findModelInCatalog.mockClear(); + mocks.loadModelCatalog.mockClear(); + mocks.modelSupportsVision.mockClear(); + mocks.resolveApiKeyForProvider.mockClear(); + mocks.resolveAutoImageModel.mockClear(); + mocks.resolveAutoMediaKeyProviders.mockClear(); + mocks.resolveDefaultMediaModel.mockClear(); + mocks.resolveDefaultModelForAgent.mockClear(); + }); + + it("uses MiniMax VLM auto selection instead of legacy chat vision catalog entries", async () => { + await expect( + describeStickerImage({ + imagePath: "/tmp/sticker.webp", + cfg: {}, + agentDir: "/tmp/agent", + }), + ).resolves.toBe("vlm ok"); + + expect(mocks.resolveDefaultMediaModel).toHaveBeenCalledWith({ + cfg: {}, + providerId: "minimax-cn", + capability: "image", + includeConfiguredImageModels: false, + }); + expect(mocks.resolveAutoImageModel).not.toHaveBeenCalled(); + expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith( + expect.objectContaining({ + filePath: "/tmp/sticker.webp", + provider: "minimax-cn", + model: "MiniMax-VL-01", + }), + ); + }); + + it("keeps MiniMax chat defaults on MiniMax VLM when other vision providers are configured", async () => { + mocks.resolveAutoMediaKeyProviders.mockReturnValue(["openai", "minimax-cn", "minimax"]); + mocks.loadModelCatalog.mockResolvedValue([ + { provider: "openai", id: "gpt-5.4", input: ["text", "image"] }, + { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] }, + { provider: "minimax-cn", id: "MiniMax-VL-01", input: ["image"] }, + ]); + + await expect( + describeStickerImage({ + imagePath: "/tmp/sticker.webp", + cfg: {}, + agentDir: "/tmp/agent", + }), + ).resolves.toBe("vlm ok"); + + 
expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "minimax-cn", + model: "MiniMax-VL-01", + }), + ); + expect(mocks.describeImageFileWithModel).not.toHaveBeenCalledWith( + expect.objectContaining({ + provider: "openai", + }), + ); + }); +}); diff --git a/extensions/telegram/src/sticker-cache.ts b/extensions/telegram/src/sticker-cache.ts index 3bb158fd099..a9dd3d30f78 100644 --- a/extensions/telegram/src/sticker-cache.ts +++ b/extensions/telegram/src/sticker-cache.ts @@ -27,6 +27,16 @@ export { const STICKER_DESCRIPTION_PROMPT = "Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective."; +function isMinimaxVlmProvider(provider: string): boolean { + const normalized = normalizeLowercaseStringOrEmpty(provider); + return ( + normalized === "minimax" || + normalized === "minimax-cn" || + normalized === "minimax-portal" || + normalized === "minimax-portal-cn" + ); +} + export interface DescribeStickerParams { imagePath: string; cfg: OpenClawConfig; @@ -50,7 +60,17 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model); const supportsVision = modelSupportsVision(entry); if (supportsVision) { - activeModel = { provider: defaultModel.provider, model: defaultModel.model }; + const model = isMinimaxVlmProvider(defaultModel.provider) + ? resolveDefaultMediaModel({ + cfg, + providerId: defaultModel.provider, + capability: "image", + includeConfiguredImageModels: false, + }) + : defaultModel.model; + if (model) { + activeModel = { provider: defaultModel.provider, model }; + } } } catch { // Ignore catalog failures; fall back to auto selection. 
@@ -83,8 +103,12 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi cfg, providerId: provider, capability: "image", + includeConfiguredImageModels: !isMinimaxVlmProvider(provider), }); const preferred = entries.find((entry) => entry.id === defaultId); + if (isMinimaxVlmProvider(provider)) { + return preferred; + } return preferred ?? entries[0]; }; diff --git a/src/agents/minimax-vlm.normalizes-api-key.test.ts b/src/agents/minimax-vlm.normalizes-api-key.test.ts index e4f470d93cb..31f2ba21f5e 100644 --- a/src/agents/minimax-vlm.normalizes-api-key.test.ts +++ b/src/agents/minimax-vlm.normalizes-api-key.test.ts @@ -75,6 +75,61 @@ describe("minimaxUnderstandImage apiKey normalization", () => { expect(fetchSpy).toHaveBeenCalledOnce(); }); + it.each(["minimax-cn", "minimax-portal-cn"])( + "routes %s to the CN VLM host by default", + async (provider) => { + const fetchSpy = vi.fn(async (input: RequestInfo | URL) => { + const requestUrl = + typeof input === "string" ? input : input instanceof URL ? input.href : input.url; + expect(requestUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm"); + return new Response(apiResponse, { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + }); + global.fetch = withFetchPreconnect(fetchSpy); + + await expect( + minimaxUnderstandImage({ + apiKey: "minimax-test-key", + provider, + prompt: "hi", + imageDataUrl: "data:image/png;base64,AAAA", + }), + ).resolves.toBe("ok"); + + expect(fetchSpy).toHaveBeenCalledOnce(); + }, + ); + + it.each(["minimax-cn", "minimax-portal-cn"])( + "keeps %s on the CN VLM host when the configured host is malformed", + async (provider) => { + const fetchSpy = vi.fn(async (input: RequestInfo | URL) => { + const requestUrl = + typeof input === "string" ? input : input instanceof URL ? 
input.href : input.url; + expect(requestUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm"); + return new Response(apiResponse, { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + }); + global.fetch = withFetchPreconnect(fetchSpy); + + await expect( + minimaxUnderstandImage({ + apiKey: "minimax-test-key", + provider, + apiHost: "https://[", + prompt: "hi", + imageDataUrl: "data:image/png;base64,AAAA", + }), + ).resolves.toBe("ok"); + + expect(fetchSpy).toHaveBeenCalledOnce(); + }, + ); + it("uses the caller-provided request timeout", async () => { const timeoutSpy = vi.spyOn(AbortSignal, "timeout"); const fetchSpy = vi.fn(async () => { @@ -103,7 +158,9 @@ describe("minimaxUnderstandImage apiKey normalization", () => { describe("isMinimaxVlmModel", () => { it("only matches the canonical MiniMax VLM model id", () => { expect(isMinimaxVlmModel("minimax", "MiniMax-VL-01")).toBe(true); + expect(isMinimaxVlmModel("minimax-cn", "MiniMax-VL-01")).toBe(true); expect(isMinimaxVlmModel("minimax-portal", "MiniMax-VL-01")).toBe(true); + expect(isMinimaxVlmModel("minimax-portal-cn", "MiniMax-VL-01")).toBe(true); expect(isMinimaxVlmModel("minimax-portal", "custom-vision")).toBe(false); expect(isMinimaxVlmModel("openai", "MiniMax-VL-01")).toBe(false); }); diff --git a/src/agents/minimax-vlm.ts b/src/agents/minimax-vlm.ts index bde911a17e0..fa002249515 100644 --- a/src/agents/minimax-vlm.ts +++ b/src/agents/minimax-vlm.ts @@ -8,35 +8,54 @@ type MinimaxBaseResp = { }; export function isMinimaxVlmProvider(provider: string): boolean { - return provider === "minimax" || provider === "minimax-portal"; + const normalized = provider.trim().toLowerCase(); + return ( + normalized === "minimax" || + normalized === "minimax-cn" || + normalized === "minimax-portal" || + normalized === "minimax-portal-cn" + ); } export function isMinimaxVlmModel(provider: string, modelId: string): boolean { return isMinimaxVlmProvider(provider) && modelId.trim() === "MiniMax-VL-01"; 
} +function isMinimaxCnProvider(provider: string | undefined): boolean { + const normalized = provider?.trim().toLowerCase(); + return normalized === "minimax-cn" || normalized === "minimax-portal-cn"; +} + function coerceApiHost(params: { apiHost?: string; modelBaseUrl?: string; + provider?: string; env?: NodeJS.ProcessEnv; }): string { const env = params.env ?? process.env; + const defaultHost = isMinimaxCnProvider(params.provider) + ? "https://api.minimaxi.com" + : "https://api.minimax.io"; const raw = params.apiHost?.trim() || env.MINIMAX_API_HOST?.trim() || params.modelBaseUrl?.trim() || - "https://api.minimax.io"; + defaultHost; try { const url = new URL(raw); return url.origin; } catch {} + if (/^[a-z][a-z\d+.-]*:\/\//i.test(raw)) { + return defaultHost; + } + try { const url = new URL(`https://${raw}`); return url.origin; } catch { - return "https://api.minimax.io"; + return defaultHost; } } @@ -51,6 +70,7 @@ export async function minimaxUnderstandImage(params: { imageDataUrl: string; apiHost?: string; modelBaseUrl?: string; + provider?: string; timeoutMs?: number; }): Promise { const apiKey = normalizeSecretInput(params.apiKey); @@ -72,6 +92,7 @@ export async function minimaxUnderstandImage(params: { const host = coerceApiHost({ apiHost: params.apiHost, modelBaseUrl: params.modelBaseUrl, + provider: params.provider, }); const url = new URL("/v1/coding_plan/vlm", host).toString(); diff --git a/src/agents/tools/image-tool.helpers.ts b/src/agents/tools/image-tool.helpers.ts index ab7a178c6d8..58508304624 100644 --- a/src/agents/tools/image-tool.helpers.ts +++ b/src/agents/tools/image-tool.helpers.ts @@ -2,6 +2,7 @@ import type { AssistantMessage } from "@earendil-works/pi-ai"; import type { OpenClawConfig } from "../../config/types.openclaw.js"; import { estimateBase64DecodedBytes } from "../../media/base64.js"; import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js"; +import { isMinimaxVlmProvider } from "../minimax-vlm.js"; import { 
findNormalizedProviderValue, normalizeProviderId } from "../model-selection.js"; import { extractAssistantText } from "../pi-embedded-utils.js"; import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.helpers.js"; @@ -238,6 +239,9 @@ export function resolveProviderVisionModelFromConfig(params: { cfg?: OpenClawConfig; provider: string; }): string | null { + if (isMinimaxVlmProvider(params.provider)) { + return null; + } const providerCfg = findNormalizedProviderValue( params.cfg?.models?.providers, params.provider, diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index 383c5247a23..601a66bee83 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -181,7 +181,9 @@ async function createOpenClawCodingToolsWithFreshModules(options?: CreateOpenCla const defaultImageModels = new Map([ ["anthropic", "claude-opus-4-6"], ["minimax", "MiniMax-VL-01"], + ["minimax-cn", "MiniMax-VL-01"], ["minimax-portal", "MiniMax-VL-01"], + ["minimax-portal-cn", "MiniMax-VL-01"], ["openai", "gpt-5.4-mini"], ["opencode", "gpt-5-nano"], ["opencode-go", "kimi-k2.6"], @@ -482,7 +484,9 @@ function installImageUnderstandingProviderStubs(...providers: MediaUnderstanding const defaultImageModels = new Map([ ["anthropic", "claude-opus-4-6"], ["minimax", "MiniMax-VL-01"], + ["minimax-cn", "MiniMax-VL-01"], ["minimax-portal", "MiniMax-VL-01"], + ["minimax-portal-cn", "MiniMax-VL-01"], ["openai", "gpt-5.4-mini"], ["opencode", "gpt-5-nano"], ["opencode-go", "kimi-k2.6"], @@ -764,6 +768,127 @@ describe("image tool implicit imageModel config", () => { }); }); + it("keeps MiniMax CN chat metadata off automatic image routing", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "minimax-cn/MiniMax-M2.5" } } }, + models: { + mode: "merge", + providers: { + "minimax-cn": { + baseUrl: "https://api.minimaxi.com/anthropic", + apiKey: 
"${MINIMAX_API_KEY}", + api: "anthropic-messages", + models: [makeModelDefinition("MiniMax-M2.5", ["text", "image"])], + }, + }, + }, + }; + const authStore = { + version: 1, + profiles: { + mini: { type: "api_key", provider: "minimax-cn", key: "minimax-test" }, + miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" }, + }, + } as const; + + expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({ + primary: "minimax-cn/MiniMax-VL-01", + }); + }); + }); + + it("prefers configured MiniMax CN image alias over canonical auto fallback", async () => { + await withTempAgentDir(async (agentDir) => { + const defaultImageModels = new Map([ + ["anthropic", "claude-opus-4-6"], + ["minimax", "MiniMax-VL-01"], + ["minimax-cn", "MiniMax-VL-01"], + ["openai", "gpt-5.4-mini"], + ]); + __testing.setProviderDepsForTest({ + buildProviderRegistry: (overrides?: Record) => + imageProviderHarness.buildProviderRegistry(overrides), + getMediaUnderstandingProvider: ( + id: string, + registry: Map, + ) => imageProviderHarness.getMediaUnderstandingProvider(id, registry), + describeImageWithModel: describeGenericImageWithModel, + describeImagesWithModel: describeGenericImagesWithModel, + resolveAutoMediaKeyProviders: ({ capability }) => + capability === "image" ? ["openai", "anthropic", "minimax-cn", "minimax"] : [], + resolveDefaultMediaModel: ({ providerId, capability }) => + capability === "image" ? 
defaultImageModels.get(providerId.toLowerCase()) : undefined, + }); + const cfg: OpenClawConfig = { + models: { + mode: "merge", + providers: { + "minimax-cn": { + baseUrl: "https://api.minimaxi.com/anthropic", + apiKey: "${MINIMAX_API_KEY}", + api: "anthropic-messages", + models: [makeModelDefinition("MiniMax-M2.5", ["text", "image"])], + }, + }, + }, + }; + const authStore = { + version: 1, + profiles: { + mini: { type: "api_key", provider: "minimax-cn", key: "minimax-test" }, + miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" }, + }, + } as const; + + expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({ + primary: "minimax-cn/MiniMax-VL-01", + }); + }); + }); + + it("keeps canonical MiniMax fallback when configured CN alias has no image candidate", async () => { + await withTempAgentDir(async (agentDir) => { + __testing.setProviderDepsForTest({ + buildProviderRegistry: (overrides?: Record) => + imageProviderHarness.buildProviderRegistry(overrides), + getMediaUnderstandingProvider: ( + id: string, + registry: Map, + ) => imageProviderHarness.getMediaUnderstandingProvider(id, registry), + describeImageWithModel: describeGenericImageWithModel, + describeImagesWithModel: describeGenericImagesWithModel, + resolveAutoMediaKeyProviders: ({ capability }) => + capability === "image" ? ["minimax"] : [], + resolveDefaultMediaModel: ({ providerId, capability }) => + capability === "image" && providerId === "minimax" ? 
"MiniMax-VL-01" : undefined, + }); + const cfg: OpenClawConfig = { + models: { + mode: "merge", + providers: { + "minimax-cn": { + baseUrl: "https://api.minimaxi.com/anthropic", + apiKey: "${MINIMAX_API_KEY}", + api: "anthropic-messages", + models: [], + }, + }, + }, + }; + const authStore = { + version: 1, + profiles: { + miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" }, + }, + } as const; + + expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({ + primary: "minimax/MiniMax-VL-01", + }); + }); + }); + it("passes the configured image timeout to provider calls", async () => { await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => { await withTempAgentDir(async (agentDir) => { diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 6eeec03d4e5..9eb64e99f1e 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -68,6 +68,50 @@ const imageToolProviderDeps = { resolveDefaultMediaModel, }; +function hasExplicitDefaultPrimaryModel(cfg?: OpenClawConfig): boolean { + const model = cfg?.agents?.defaults?.model; + if (typeof model === "string") { + return model.trim().length > 0; + } + return typeof model?.primary === "string" && model.primary.trim().length > 0; +} + +function modelRefProvider(candidate: string | null | undefined): string | undefined { + const trimmed = candidate?.trim(); + if (!trimmed?.includes("/")) { + return undefined; + } + return trimmed.slice(0, trimmed.indexOf("/")).trim(); +} + +function isExecutionAliasCandidateForProvider( + candidate: string | null | undefined, + provider: string, +): boolean { + const candidateProvider = modelRefProvider(candidate); + return Boolean( + candidateProvider && + candidateProvider !== normalizeMediaProviderId(candidateProvider) && + normalizeMediaProviderId(candidateProvider) === normalizeMediaProviderId(provider), + ); +} + +function isCanonicalCandidateShadowedByExecutionAlias( + candidate: string | 
null | undefined, + candidates: readonly (string | null | undefined)[], +): boolean { + const candidateProvider = modelRefProvider(candidate); + if (!candidateProvider || candidateProvider !== normalizeMediaProviderId(candidateProvider)) { + return false; + } + if (!isMinimaxVlmProvider(candidateProvider)) { + return false; + } + return candidates.some((shadowCandidate) => + isExecutionAliasCandidateForProvider(shadowCandidate, candidateProvider), + ); +} + export const __testing = { decodeDataUrl, coerceImageAssistantText, @@ -148,6 +192,7 @@ export function resolveImageModelConfigForTool(params: { workspaceDir: params.workspaceDir, providerId: primary.provider, capability: "image", + includeConfiguredImageModels: !isMinimaxVlmProvider(primary.provider), }); if (providerDefault) { return [`${primary.provider}/${providerDefault}`]; @@ -158,7 +203,7 @@ export function resolveImageModelConfigForTool(params: { return []; })(); - const autoCandidates = imageToolProviderDeps + const rawAutoCandidates = imageToolProviderDeps .resolveAutoMediaKeyProviders({ cfg: params.cfg, workspaceDir: params.workspaceDir, @@ -170,15 +215,33 @@ export function resolveImageModelConfigForTool(params: { workspaceDir: params.workspaceDir, providerId, capability: "image", + includeConfiguredImageModels: !isMinimaxVlmProvider(providerId), }); return modelId ? `${providerId}/${modelId}` : null; }); + const autoCandidates = rawAutoCandidates.filter( + (candidate) => + !isCanonicalCandidateShadowedByExecutionAlias(candidate, [ + ...primaryCandidates, + ...rawAutoCandidates, + ]), + ); + const defaultPrimaryIsImplicit = !hasExplicitDefaultPrimaryModel(params.cfg); + const primaryAliasCandidates = defaultPrimaryIsImplicit + ? autoCandidates.filter((candidate) => + isExecutionAliasCandidateForProvider(candidate, primary.provider), + ) + : []; + const remainingAutoCandidates = + primaryAliasCandidates.length === 0 + ? 
autoCandidates + : autoCandidates.filter((candidate) => !primaryAliasCandidates.includes(candidate)); return buildToolModelConfigFromCandidates({ explicit, agentDir: params.agentDir, authStore: params.authStore, - candidates: [...primaryCandidates, ...autoCandidates], + candidates: [...primaryAliasCandidates, ...primaryCandidates, ...remainingAutoCandidates], }); } diff --git a/src/agents/tools/pdf-tool.model-config.test.ts b/src/agents/tools/pdf-tool.model-config.test.ts index 125425f3e14..e9bdad13a49 100644 --- a/src/agents/tools/pdf-tool.model-config.test.ts +++ b/src/agents/tools/pdf-tool.model-config.test.ts @@ -28,6 +28,9 @@ vi.mock("./model-config.helpers.js", () => ({ if (provider === "google") { return Boolean(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY); } + if (provider === "minimax" || provider === "minimax-cn") { + return Boolean(process.env.MINIMAX_API_KEY); + } return false; }, resolveDefaultModelRef: (cfg?: OpenClawConfig) => { @@ -105,4 +108,33 @@ describe("resolvePdfModelConfigForTool", () => { ANTHROPIC_PDF_MODEL, ); }); + + it("does not add configured MiniMax chat models as automatic PDF image fallbacks", () => { + vi.stubEnv("MINIMAX_API_KEY", "minimax-test"); + const cfg = { + ...withDefaultModel("openai/gpt-5.4"), + models: { + providers: { + minimax: { + baseUrl: "https://api.minimax.io/anthropic", + models: [ + { + id: "MiniMax-M2.7", + name: "MiniMax M2.7", + reasoning: false, + input: ["text", "image"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 128_000, + maxTokens: 8_192, + }, + ], + }, + }, + }, + } as OpenClawConfig; + + expect(resolvePdfModelConfigForTool({ cfg, agentDir: TEST_AGENT_DIR })).toEqual({ + primary: "minimax/MiniMax-VL-01", + }); + }); }); diff --git a/src/agents/tools/pdf-tool.model-config.ts b/src/agents/tools/pdf-tool.model-config.ts index 8a32a1da401..da9dc09cd77 100644 --- a/src/agents/tools/pdf-tool.model-config.ts +++ b/src/agents/tools/pdf-tool.model-config.ts @@ 
-5,6 +5,7 @@ import { resolveDefaultMediaModel, } from "../../media-understanding/defaults.js"; import type { AuthProfileStore } from "../auth-profiles/types.js"; +import { isMinimaxVlmProvider } from "../minimax-vlm.js"; import { coerceImageModelConfig, type ImageModelConfig, @@ -45,6 +46,7 @@ function resolveImageCandidateRefs(params: { workspaceDir: params.workspaceDir, providerId, capability: "image", + includeConfiguredImageModels: !isMinimaxVlmProvider(providerId), }); return modelId ? `${providerId}/${modelId}` : null; }) @@ -106,6 +108,7 @@ export function resolvePdfModelConfigForTool(params: { workspaceDir: params.workspaceDir, providerId: primary.provider, capability: "image", + includeConfiguredImageModels: !isMinimaxVlmProvider(primary.provider), }); const primarySupportsNativePdf = providerSupportsNativePdfDocument({ cfg: params.cfg, @@ -136,6 +139,7 @@ export function resolvePdfModelConfigForTool(params: { const providerId = providerKey.trim(); if ( !providerId || + isMinimaxVlmProvider(providerId) || !hasAuthForProvider({ provider: providerId, agentDir: params.agentDir, diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts index 5ad614a279d..e5507b13c7e 100644 --- a/src/cli/capability-cli.test.ts +++ b/src/cli/capability-cli.test.ts @@ -1125,6 +1125,26 @@ describe("capability cli", () => { expect(outputs[0]?.kind).toBe("image.description"); }); + it("keeps image describe HTTP URLs as URLs", async () => { + await runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: [ + "capability", + "image", + "describe", + "--file", + "https://httpbin.org/image/png", + "--json", + ], + }); + + const describeCall = imageDescribeCall(); + expect(describeCall?.filePath).toBe("https://httpbin.org/image/png"); + const output = firstJsonOutput(); + const outputs = output?.outputs as Array>; + expect(outputs[0]?.path).toBe("https://httpbin.org/image/png"); + }); + it("passes image describe prompts through media 
understanding", async () => { await runRegisteredCli({ register: registerCapabilityCli as (program: Command) => void, @@ -1221,6 +1241,28 @@ describe("capability cli", () => { expect(outputs[0]?.path).toBe("https://example.com/photo.png"); }); + it("keeps explicit-model image describe HTTP URLs as URLs", async () => { + await runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: [ + "capability", + "image", + "describe", + "--file", + "https://httpbin.org/image/png", + "--model", + "minimax-cn/MiniMax-VL-01", + "--json", + ], + }); + + const describeCall = firstImageDescribeWithModelCall(); + expect(describeCall?.filePath).toBe("https://httpbin.org/image/png"); + expect(describeCall?.provider).toBe("minimax-cn"); + expect(describeCall?.model).toBe("MiniMax-VL-01"); + expect(mocks.describeImageFile).not.toHaveBeenCalled(); + }); + it("passes describe-many prompts to each image", async () => { await runRegisteredCli({ register: registerCapabilityCli as (program: Command) => void, diff --git a/src/cli/capability-cli.ts b/src/cli/capability-cli.ts index 8fa940437fb..ec2020ce8d3 100644 --- a/src/cli/capability-cli.ts +++ b/src/cli/capability-cli.ts @@ -1097,8 +1097,8 @@ async function runImageDescribe(params: { const prompt = normalizeOptionalString(params.prompt); const outputs = await Promise.all( params.files.map(async (filePath) => { - const isRemoteUrl = /^https?:\/\//i.test(filePath.trim()); - const resolvedPath = isRemoteUrl ? filePath.trim() : path.resolve(filePath); + const resolvedPath = resolveImageDescribeInput(filePath); + const isRemoteUrl = /^https?:\/\//i.test(resolvedPath); const result = activeModel ? await describeImageFileWithModel({ filePath: resolvedPath, @@ -1513,6 +1513,11 @@ async function runTtsProviders(transport: CapabilityTransport) { }; } +function resolveImageDescribeInput(filePath: string): string { + const trimmed = filePath.trim(); + return /^https?:\/\//i.test(trimmed) ? 
trimmed : path.resolve(filePath); +} + async function runTtsPersonas(transport: CapabilityTransport) { if (transport === "gateway") { return await callGateway({ diff --git a/src/media-understanding/attachments.cache.ts b/src/media-understanding/attachments.cache.ts index 037795494f7..6f6ac570701 100644 --- a/src/media-understanding/attachments.cache.ts +++ b/src/media-understanding/attachments.cache.ts @@ -54,6 +54,14 @@ type AttachmentCacheEntry = { let defaultLocalPathRoots: readonly string[] | undefined; +function concreteMime(mime: string | undefined): string | undefined { + const normalized = mime?.trim(); + if (!normalized || normalized.endsWith("/*")) { + return undefined; + } + return normalized; +} + function getDefaultLocalPathRoots(): readonly string[] { defaultLocalPathRoots ??= mergeInboundPathRoots(getDefaultMediaLocalRoots()); return defaultLocalPathRoots; @@ -128,7 +136,7 @@ export class MediaAttachmentCache { entry.buffer = buffer; entry.bufferMime = entry.bufferMime ?? - entry.attachment.mime ?? + concreteMime(entry.attachment.mime) ?? (await detectMime({ buffer, filePath, @@ -169,7 +177,7 @@ export class MediaAttachmentCache { }); entry.buffer = fetched.buffer; entry.bufferMime = - entry.attachment.mime ?? + concreteMime(entry.attachment.mime) ?? fetched.contentType ?? 
(await detectMime({ buffer: fetched.buffer, diff --git a/src/media-understanding/defaults.test.ts b/src/media-understanding/defaults.test.ts index f329d8b0c0f..6dad50f5c78 100644 --- a/src/media-understanding/defaults.test.ts +++ b/src/media-understanding/defaults.test.ts @@ -140,6 +140,30 @@ describe("resolveDefaultMediaModel", () => { "kimi-k2.6", ); }); + + it("prefers configured image models before manifest defaults", () => { + const cfg = { + models: { + providers: { + openrouter: { + models: [{ id: "google/gemini-2.5-flash", input: ["text", "image"] }], + }, + }, + }, + } as never; + + expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "image", cfg })).toBe( + "google/gemini-2.5-flash", + ); + expect( + resolveDefaultMediaModel({ + providerId: "openrouter", + capability: "image", + cfg, + includeConfiguredImageModels: false, + }), + ).toBe("auto"); + }); }); describe("resolveAutoMediaKeyProviders", () => { @@ -166,6 +190,36 @@ describe("resolveAutoMediaKeyProviders", () => { ]); }); + it("preserves configured MiniMax CN aliases for image auto discovery", () => { + const providers = resolveAutoMediaKeyProviders({ + capability: "image", + cfg: { + models: { + providers: { + "minimax-cn": { + models: [{ id: "MiniMax-M2.7", input: ["text", "image"] }], + }, + "minimax-portal-cn": { + models: [{ id: "MiniMax-M2.7", input: ["text", "image"] }], + }, + gemini: { + models: [{ id: "gemini-3-flash-preview", input: ["text", "image"] }], + }, + }, + }, + } as never, + }); + + expect(providers).toContain("minimax-cn"); + expect(providers).toContain("minimax-portal-cn"); + expect(providers).not.toContain("gemini"); + expect(providers).toContain("google"); + expect(providers.indexOf("minimax-cn")).toBeLessThan(providers.indexOf("minimax")); + expect(providers.indexOf("minimax-portal-cn")).toBeLessThan( + providers.indexOf("minimax-portal"), + ); + }); + it("keeps the bundled video fallback order", () => { expect(resolveAutoMediaKeyProviders({ 
capability: "video" })).toEqual([ "google", diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index e101a3ffb55..e310ab3df3b 100644 --- a/src/media-understanding/defaults.ts +++ b/src/media-understanding/defaults.ts @@ -2,7 +2,10 @@ import { resolveRuntimeConfigCacheKey } from "../config/runtime-snapshot.js"; import type { OpenClawConfig } from "../config/types.js"; import { normalizeOptionalString } from "../shared/string-coerce.js"; import { buildMediaUnderstandingManifestMetadataRegistry } from "./manifest-metadata.js"; -import { normalizeMediaProviderId } from "./provider-registry.js"; +import { + normalizeMediaExecutionProviderId, + normalizeMediaProviderId, +} from "./provider-registry.js"; import { providerSupportsCapability } from "./provider-supports.js"; import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js"; export { @@ -65,11 +68,11 @@ function resolveConfiguredImageProviderModel(params: { cfg?: OpenClawConfig; providerId: string; }): string | undefined { + const normalizedProviderId = normalizeMediaProviderId(params.providerId); const providers = params.cfg?.models?.providers; if (!providers || typeof providers !== "object") { return undefined; } - const normalizedProviderId = normalizeMediaProviderId(params.providerId); for (const [providerKey, providerCfg] of Object.entries(providers)) { if (normalizeMediaProviderId(providerKey) !== normalizedProviderId) { continue; @@ -93,7 +96,7 @@ function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] { } const configured: string[] = []; for (const [providerKey, providerCfg] of Object.entries(providers)) { - const normalizedProviderId = normalizeMediaProviderId(providerKey); + const normalizedProviderId = normalizeMediaExecutionProviderId(providerKey); if (!normalizedProviderId || configured.includes(normalizedProviderId)) { continue; } @@ -108,14 +111,39 @@ function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): 
string[] { return configured; } +function isExecutionAliasProvider(providerId: string): boolean { + return normalizeMediaProviderId(providerId) !== providerId; +} + +function insertConfiguredImageProviders(params: { + prioritized: string[]; + configured: string[]; +}): string[] { + const merged = [...params.prioritized]; + for (const providerId of params.configured.filter(isExecutionAliasProvider)) { + const canonicalProviderId = normalizeMediaProviderId(providerId); + const canonicalIndex = merged.indexOf(canonicalProviderId); + if (canonicalIndex >= 0) { + merged.splice(canonicalIndex, 0, providerId); + } else { + merged.unshift(providerId); + } + } + for (const providerId of params.configured.filter((id) => !isExecutionAliasProvider(id))) { + merged.push(providerId); + } + return [...new Set(merged)]; +} + export function resolveDefaultMediaModel(params: { providerId: string; capability: MediaUnderstandingCapability; cfg?: OpenClawConfig; workspaceDir?: string; providerRegistry?: Map; + includeConfiguredImageModels?: boolean; }): string | undefined { - if (!params.providerRegistry) { + if (!params.providerRegistry && params.includeConfiguredImageModels !== false) { const configuredImageModel = params.capability === "image" ? resolveConfiguredImageProviderModel({ @@ -130,7 +158,13 @@ export function resolveDefaultMediaModel(params: { const registry = params.providerRegistry ?? 
resolveDefaultRegistry(params.cfg, params.workspaceDir); const provider = registry.get(normalizeMediaProviderId(params.providerId)); - return normalizeOptionalString(provider?.defaultModels?.[params.capability]); + const manifestDefaultModel = normalizeOptionalString( + provider?.defaultModels?.[params.capability], + ); + if (manifestDefaultModel) { + return manifestDefaultModel; + } + return undefined; } export function resolveAutoMediaKeyProviders(params: { @@ -165,7 +199,10 @@ export function resolveAutoMediaKeyProviders(params: { if (params.providerRegistry || params.capability !== "image") { return prioritized; } - return [...new Set([...prioritized, ...resolveConfiguredImageProviderIds(params.cfg)])]; + return insertConfiguredImageProviders({ + prioritized, + configured: resolveConfiguredImageProviderIds(params.cfg), + }); } export function providerSupportsNativePdfDocument(params: { diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts index 943ae2291ac..1ed605222e3 100644 --- a/src/media-understanding/image.test.ts +++ b/src/media-understanding/image.test.ts @@ -335,6 +335,135 @@ describe("describeImageWithModel", () => { expect(fetchMock).toHaveBeenCalledOnce(); }); + it("uses canonical MiniMax CN baseUrl for VLM alias fallback", async () => { + const authStorage = { + setRuntimeApiKey: setRuntimeApiKeyMock, + }; + resolveModelAsyncMock.mockResolvedValue({ + authStorage, + modelRegistry: { find: vi.fn(() => null) }, + error: "Unknown model: minimax-cn/MiniMax-VL-01", + }); + + await expect( + describeImageWithModel({ + cfg: { + models: { + providers: { + minimax: { + apiKey: "minimax-test-key", + baseUrl: "https://api.minimaxi.com/anthropic", + models: [], + }, + }, + }, + }, + agentDir: "/tmp/openclaw-agent", + provider: "minimax-cn", + model: "MiniMax-VL-01", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }), + 
).resolves.toEqual({ + text: "portal ok", + model: "MiniMax-VL-01", + }); + + expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "minimax", + }), + ); + const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch"); + expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm"); + }); + + it("uses MiniMax CN alias auth when the alias apiKey is a SecretRef", async () => { + const authStorage = { + setRuntimeApiKey: setRuntimeApiKeyMock, + }; + resolveModelAsyncMock.mockResolvedValue({ + authStorage, + modelRegistry: { find: vi.fn(() => null) }, + error: "Unknown model: minimax-cn/MiniMax-VL-01", + }); + + await expect( + describeImageWithModel({ + cfg: { + models: { + providers: { + "minimax-cn": { + apiKey: { source: "file", provider: "default", id: "/providers/minimax-cn/apiKey" }, + baseUrl: "https://api.minimaxi.com/anthropic", + models: [], + }, + }, + }, + }, + agentDir: "/tmp/openclaw-agent", + provider: "minimax-cn", + model: "MiniMax-VL-01", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }), + ).resolves.toEqual({ + text: "portal ok", + model: "MiniMax-VL-01", + }); + + expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "minimax-cn", + }), + ); + const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch"); + expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm"); + }); + + it("does not inherit global MiniMax baseUrl for CN VLM aliases", async () => { + const authStorage = { + setRuntimeApiKey: setRuntimeApiKeyMock, + }; + resolveModelAsyncMock.mockResolvedValue({ + authStorage, + modelRegistry: { find: vi.fn(() => null) }, + error: "Unknown model: minimax-cn/MiniMax-VL-01", + }); + + await expect( + describeImageWithModel({ + cfg: { + models: { + providers: { + minimax: { baseUrl: "https://api.minimax.io/anthropic", models: [] }, + }, + }, + }, + agentDir: 
"/tmp/openclaw-agent", + provider: "minimax-cn", + model: "MiniMax-VL-01", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }), + ).resolves.toEqual({ + text: "portal ok", + model: "MiniMax-VL-01", + }); + + const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch"); + expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm"); + }); + it("carries workspaceDir through image model and stream resolution", async () => { discoverModelsMock.mockReturnValue({ find: vi.fn(() => ({ diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts index a29439fa663..bada17737ea 100644 --- a/src/media-understanding/image.ts +++ b/src/media-understanding/image.ts @@ -21,11 +21,13 @@ import { coerceImageAssistantText, hasImageReasoningOnlyResponse, } from "../agents/tools/image-tool.helpers.js"; +import { isSecretRef } from "../config/types.secrets.js"; import { buildCopilotIdeHeaders, COPILOT_INTEGRATION_ID, resolveCopilotApiToken, } from "../plugin-sdk/provider-auth.js"; +import { normalizeMediaProviderId } from "./provider-id.js"; import type { ImageDescriptionRequest, ImageDescriptionResult, @@ -315,6 +317,7 @@ function buildImageRequestHeaders(model: Model): Record | u async function describeImagesWithMinimax(params: { apiKey: string; + provider: string; modelId: string; modelBaseUrl?: string; prompt: string; @@ -329,6 +332,7 @@ async function describeImagesWithMinimax(params: { : params.prompt; const text = await minimaxUnderstandImage({ apiKey: params.apiKey, + provider: params.provider, prompt, imageDataUrl: `data:${image.mime ?? 
"image/jpeg"};base64,${image.buffer.toString("base64")}`, modelBaseUrl: params.modelBaseUrl, @@ -354,9 +358,53 @@ function resolveConfiguredProviderBaseUrl( if (typeof direct?.baseUrl === "string" && direct.baseUrl.trim()) { return direct.baseUrl.trim(); } + const normalizedProvider = normalizeMediaProviderId(provider); + const normalized = cfg.models?.providers?.[normalizedProvider]; + if (typeof normalized?.baseUrl === "string" && normalized.baseUrl.trim()) { + if (isMinimaxCnAlias(provider) && !isMinimaxCnBaseUrl(normalized.baseUrl)) { + return undefined; + } + return normalized.baseUrl.trim(); + } return undefined; } +function isMinimaxCnAlias(provider: string): boolean { + const normalized = provider.trim().toLowerCase(); + return normalized === "minimax-cn" || normalized === "minimax-portal-cn"; +} + +function isMinimaxCnBaseUrl(baseUrl: string): boolean { + const trimmed = baseUrl.trim(); + if (!trimmed) { + return false; + } + try { + const parsed = new URL(/^https?:\/\//i.test(trimmed) ? 
trimmed : `https://${trimmed}`); + return parsed.hostname.toLowerCase() === "api.minimaxi.com"; + } catch { + return false; + } +} + +function hasConfiguredProviderApiKey( + cfg: ImageDescriptionRequest["cfg"], + provider: string, +): boolean { + const apiKey = cfg.models?.providers?.[provider]?.apiKey; + return (typeof apiKey === "string" && apiKey.trim().length > 0) || isSecretRef(apiKey); +} + +function resolveMinimaxVlmAuthProvider( + cfg: ImageDescriptionRequest["cfg"], + provider: string, +): string { + if (!isMinimaxCnAlias(provider) || hasConfiguredProviderApiKey(cfg, provider)) { + return provider; + } + return normalizeMediaProviderId(provider); +} + async function resolveMinimaxVlmFallbackRuntime(params: { cfg: ImageDescriptionRequest["cfg"]; agentDir: string; @@ -365,8 +413,9 @@ async function resolveMinimaxVlmFallbackRuntime(params: { profile?: string; preferredProfile?: string; }): Promise<{ apiKey: string; modelBaseUrl?: string }> { + const authProvider = resolveMinimaxVlmAuthProvider(params.cfg, params.provider); const auth = await resolveApiKeyForProvider({ - provider: params.provider, + provider: authProvider, cfg: params.cfg, profileId: params.profile, preferredProfile: params.preferredProfile, @@ -374,7 +423,7 @@ async function resolveMinimaxVlmFallbackRuntime(params: { ...(params.workspaceDir ? 
{ workspaceDir: params.workspaceDir } : {}), }); return { - apiKey: requireApiKey(auth, params.provider), + apiKey: requireApiKey(auth, authProvider), modelBaseUrl: resolveConfiguredProviderBaseUrl(params.cfg, params.provider), }; } @@ -437,6 +486,7 @@ async function describeImagesWithModelInternal( const fallback = await resolveMinimaxVlmFallbackRuntime(params); return await describeImagesWithMinimax({ apiKey: fallback.apiKey, + provider: params.provider, modelId: params.model, modelBaseUrl: fallback.modelBaseUrl, prompt, @@ -448,6 +498,7 @@ async function describeImagesWithModelInternal( if (isMinimaxVlmModel(model.provider, model.id)) { return await describeImagesWithMinimax({ apiKey, + provider: model.provider, modelId: model.id, modelBaseUrl: model.baseUrl, prompt, diff --git a/src/media-understanding/media-understanding-misc.test.ts b/src/media-understanding/media-understanding-misc.test.ts index d1ac691576a..1cd9111bd27 100644 --- a/src/media-understanding/media-understanding-misc.test.ts +++ b/src/media-understanding/media-understanding-misc.test.ts @@ -107,6 +107,28 @@ describe("media understanding attachments SSRF", () => { expect(fetchSpy).toHaveBeenCalledTimes(1); }); + it("uses fetched content type instead of wildcard selection hints", async () => { + const url = "http://198.18.0.153/image"; + const fetchSpy = vi.fn().mockResolvedValue( + new Response("image", { + headers: { "content-type": "image/png" }, + }), + ); + globalThis.fetch = withFetchPreconnect(fetchSpy); + const cache = new MediaAttachmentCache([{ index: 0, url, mime: "image/*" }], { + ssrfPolicy: { allowRfc2544BenchmarkRange: true }, + }); + + const result = await cache.getBuffer({ + attachmentIndex: 0, + maxBytes: 1024, + timeoutMs: 1000, + }); + + expect(result.mime).toBe("image/png"); + expect(result.fileName).toBe("image.png"); + }); + it("reads local attachments inside configured roots", async () => { await withLocalAttachmentCache("openclaw-media-cache-allowed-", async ({ cache }) 
=> { const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 }); diff --git a/src/media-understanding/provider-id.ts b/src/media-understanding/provider-id.ts index 777fbeab7ba..c48152f9bf6 100644 --- a/src/media-understanding/provider-id.ts +++ b/src/media-understanding/provider-id.ts @@ -5,5 +5,19 @@ export function normalizeMediaProviderId(id: string): string { if (normalized === "gemini") { return "google"; } + if (normalized === "minimax-cn") { + return "minimax"; + } + if (normalized === "minimax-portal-cn") { + return "minimax-portal"; + } return normalized; } + +export function normalizeMediaExecutionProviderId(id: string): string { + const normalized = normalizeProviderId(id); + if (normalized === "minimax-cn" || normalized === "minimax-portal-cn") { + return normalized; + } + return normalizeMediaProviderId(normalized); +} diff --git a/src/media-understanding/provider-registry.ts b/src/media-understanding/provider-registry.ts index ac1688fc9a7..9c3c6d9e724 100644 --- a/src/media-understanding/provider-registry.ts +++ b/src/media-understanding/provider-registry.ts @@ -41,7 +41,7 @@ function hydrateModelBackedMediaProvider( }; } -export { normalizeMediaProviderId } from "./provider-id.js"; +export { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.js"; export function buildMediaUnderstandingRegistry( overrides?: Record, diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts index eba690b226e..76b6cbfd27d 100644 --- a/src/media-understanding/runner.entries.ts +++ b/src/media-understanding/runner.entries.ts @@ -34,6 +34,7 @@ import { MediaUnderstandingSkipError } from "./errors.js"; import { fileExists } from "./fs.js"; import { describeImageWithModel } from "./image-runtime.js"; import { extractGeminiResponse } from "./output-extract.js"; +import { normalizeMediaExecutionProviderId } from "./provider-id.js"; import { getMediaUnderstandingProvider, 
normalizeMediaProviderId } from "./provider-registry.js"; import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js"; import type { @@ -566,6 +567,7 @@ export async function runProviderEntry(params: { throw new Error(`Provider entry missing provider for ${capability}`); } const providerId = normalizeMediaProviderId(providerIdRaw); + const requestProviderId = normalizeMediaExecutionProviderId(providerIdRaw); const { maxBytes, maxChars, timeoutMs, prompt } = resolveEntryRunOptions({ capability, entry, @@ -587,13 +589,13 @@ export async function runProviderEntry(params: { timeoutMs, }); const requestOverrides = resolveMediaRequestOverrides(params.config); - const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry); + const provider = getMediaUnderstandingProvider(requestProviderId, params.providerRegistry); const imageInput = { buffer: media.buffer, fileName: media.fileName, mime: media.mime, model: modelId, - provider: providerId, + provider: requestProviderId, prompt: requestOverrides.prompt ?? prompt, timeoutMs, profile: entry.profile, @@ -608,7 +610,7 @@ export async function runProviderEntry(params: { kind: "image.description", attachmentIndex: params.attachmentIndex, text: trimOutput(result.text, maxChars), - provider: providerId, + provider: requestProviderId, model: result.model ?? 
modelId, }; } diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 61198867d37..a9ceb41682d 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -2,6 +2,7 @@ import { constants as fsConstants } from "node:fs"; import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; +import { isMinimaxVlmModel, isMinimaxVlmProvider } from "../agents/minimax-vlm.js"; import { findNormalizedProviderValue } from "../agents/provider-id.js"; import type { MsgContext } from "../auto-reply/templating.js"; import { @@ -26,7 +27,7 @@ import { MediaAttachmentCache, selectAttachments } from "./attachments.js"; import { isMediaUnderstandingSkipError } from "./errors.js"; import { fileExists } from "./fs.js"; import { extractGeminiResponse } from "./output-extract.js"; -import { normalizeMediaProviderId } from "./provider-id.js"; +import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.js"; import { buildMediaUnderstandingRegistry, getMediaUnderstandingProvider, @@ -73,7 +74,7 @@ function resolveLiteralProviderApiKey( cfg: OpenClawConfig | undefined, providerId: string, ): string | null { - const value = cfg?.models?.providers?.[providerId]?.apiKey; + const value = findNormalizedProviderValue(cfg?.models?.providers, providerId)?.apiKey; return typeof value === "string" && value.trim().length > 0 ? value.trim() : null; } @@ -98,11 +99,14 @@ function resolveConfiguredKeyProviderOrder(params: { fallbackProviders: readonly string[]; }): string[] { const configuredProviders = Object.keys(params.cfg.models?.providers ?? 
{}) - .map((providerId) => normalizeMediaProviderId(providerId)) + .map((providerId) => normalizeMediaExecutionProviderId(providerId)) .filter(Boolean) .filter((providerId, index, values) => values.indexOf(providerId) === index) .filter((providerId) => - providerSupportsCapability(params.providerRegistry.get(providerId), params.capability), + providerSupportsCapability( + params.providerRegistry.get(normalizeMediaProviderId(providerId)), + params.capability, + ), ); return [...new Set([...configuredProviders, ...params.fallbackProviders])]; @@ -112,6 +116,9 @@ function resolveConfiguredImageModelId(params: { cfg: OpenClawConfig; providerId: string; }): string | undefined { + if (isMinimaxVlmProvider(params.providerId)) { + return undefined; + } const configured = resolveConfiguredImageModel(params); const id = configured?.id?.trim(); return id || undefined; @@ -145,7 +152,7 @@ function resolveCatalogImageModelId(params: { }): string | undefined { const matches = params.catalog.filter( (entry) => - normalizeMediaProviderId(entry.provider) === params.providerId && + normalizeMediaProviderId(entry.provider) === normalizeMediaProviderId(params.providerId) && params.modelSupportsVision(entry), ); if (matches.length === 0) { @@ -200,6 +207,12 @@ async function explicitImageModelVisionStatus(params: { providerId: string; model: string; }): Promise<"supported" | "unsupported" | "unknown"> { + if ( + isMinimaxVlmProvider(params.providerId) && + !isMinimaxVlmModel(params.providerId, params.model) + ) { + return "unsupported"; + } const configured = resolveConfiguredImageModel(params); if (configured?.id?.trim() === params.model && configured.input?.includes("image")) { return "supported"; @@ -231,6 +244,9 @@ async function resolveAutoImageModelId(params: { return explicit; } } + if (isMinimaxVlmProvider(params.providerId)) { + return "MiniMax-VL-01"; + } const configuredModel = resolveConfiguredImageModelId(params); if (configuredModel) { return configuredModel; @@ -736,7 
+752,7 @@ async function resolveActiveModelEntry(params: { if (!activeProviderRaw) { return null; } - const providerId = normalizeMediaProviderId(activeProviderRaw); + const providerId = normalizeMediaExecutionProviderId(activeProviderRaw); if (!providerId) { return null; } @@ -940,6 +956,7 @@ export async function runCapability(params: { if ( capability === "image" && activeProvider && + !isMinimaxVlmProvider(activeProvider) && !hasExplicitImageUnderstandingConfig({ cfg, config }) ) { const { findModelInCatalog, loadModelCatalog, modelSupportsVision } = diff --git a/src/media-understanding/runner.vision-skip.test.ts b/src/media-understanding/runner.vision-skip.test.ts index 305852fe737..83eb7405ed6 100644 --- a/src/media-understanding/runner.vision-skip.test.ts +++ b/src/media-understanding/runner.vision-skip.test.ts @@ -12,6 +12,7 @@ import { createEmptyPluginRegistry } from "../plugins/registry.js"; import { setActivePluginRegistry } from "../plugins/runtime.js"; import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js"; import { withMediaFixture } from "./runner.test-utils.js"; +import type { MediaUnderstandingProvider } from "./types.js"; type TestCatalogEntry = { id: string; @@ -273,7 +274,7 @@ describe("runCapability image skip", () => { imageModel: { primary: "openrouter/google/gemini-2.5-flash" }, }, }, - } as OpenClawConfig; + } as unknown as OpenClawConfig; await expect( resolveAutoImageModel({ @@ -286,13 +287,13 @@ describe("runCapability image skip", () => { }); }); - it("falls back from an active text model to the provider image default", async () => { + it("falls back from a MiniMax chat model to the provider image default", async () => { catalog = [ { id: "MiniMax-M2.7", name: "MiniMax M2.7", provider: "minimax-portal", - input: ["text"] as const, + input: ["text", "image"] as const, }, { id: "MiniMax-VL-01", @@ -302,7 +303,20 @@ describe("runCapability image skip", () => { }, ]; vi.stubEnv("MINIMAX_API_KEY", 
"test-minimax-key"); - const cfg = {} as OpenClawConfig; + const cfg = { + models: { + providers: { + "minimax-portal": { + models: [ + { + id: "MiniMax-M2.7", + input: ["text", "image"], + }, + ], + }, + }, + }, + } as unknown as OpenClawConfig; const pluginRegistry = createEmptyPluginRegistry(); pluginRegistry.mediaUnderstandingProviders.push({ pluginId: "minimax", @@ -333,6 +347,300 @@ describe("runCapability image skip", () => { } }); + it("does not native-skip MiniMax chat models that claim image input", async () => { + catalog = [ + { + id: "MiniMax-M2.7", + name: "MiniMax M2.7", + provider: "minimax-portal", + input: ["text", "image"] as const, + }, + ]; + vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key"); + const cfg = { + models: { + providers: { + "minimax-portal": { + models: [ + { + id: "MiniMax-M2.7", + input: ["text", "image"], + }, + ], + }, + }, + }, + } as unknown as OpenClawConfig; + const pluginRegistry = createEmptyPluginRegistry(); + pluginRegistry.mediaUnderstandingProviders.push({ + pluginId: "minimax", + pluginName: "MiniMax Provider", + source: "test", + provider: { + id: "minimax-portal", + capabilities: ["image"], + defaultModels: { image: "MiniMax-VL-01" }, + describeImage: async (req) => ({ text: "vlm ok", model: req.model }), + }, + }); + setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg); + + try { + await withMediaFixture( + { + filePrefix: "openclaw-minimax-vlm-no-native-skip", + extension: "png", + mediaType: "image/png", + fileContents: Buffer.from("image"), + }, + async ({ ctx, media, cache }) => { + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, + agentDir: "/tmp", + providerRegistry: buildProviderRegistry(undefined, cfg), + activeModel: { provider: "minimax-portal", model: "MiniMax-M2.7" }, + }); + + expect(result.decision.outcome).toBe("success"); + expect(requireCapabilityOutput(result, 0)).toEqual({ + kind: "image.description", + attachmentIndex: 0, + 
provider: "minimax-portal", + model: "MiniMax-VL-01", + text: "vlm ok", + }); + }, + ); + } finally { + setActivePluginRegistry(createEmptyPluginRegistry()); + vi.unstubAllEnvs(); + } + }); + + it("preserves MiniMax CN aliases from configured provider routing", async () => { + const seenProviders: string[] = []; + const cfg = { + models: { + providers: { + "minimax-cn": { + apiKey: "test-minimax-key", + baseUrl: "https://api.minimaxi.com/anthropic", + models: [], + }, + }, + }, + } as OpenClawConfig; + const pluginRegistry = createEmptyPluginRegistry(); + pluginRegistry.mediaUnderstandingProviders.push({ + pluginId: "minimax", + pluginName: "MiniMax Provider", + source: "test", + provider: { + id: "minimax", + capabilities: ["image"], + defaultModels: { image: "MiniMax-VL-01" }, + describeImage: async (req) => { + seenProviders.push(req.provider); + return { text: "cn vlm ok", model: req.model }; + }, + }, + }); + setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg); + + try { + await withMediaFixture( + { + filePrefix: "openclaw-minimax-cn-provider", + extension: "png", + mediaType: "image/png", + fileContents: Buffer.from("image"), + }, + async ({ ctx, media, cache }) => { + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, + agentDir: "/tmp", + providerRegistry: buildProviderRegistry(undefined, cfg), + }); + + expect(result.decision.outcome).toBe("success"); + expect(seenProviders).toEqual(["minimax-cn"]); + expect(requireCapabilityOutput(result, 0)).toEqual({ + kind: "image.description", + attachmentIndex: 0, + provider: "minimax-cn", + model: "MiniMax-VL-01", + text: "cn vlm ok", + }); + }, + ); + } finally { + setActivePluginRegistry(createEmptyPluginRegistry()); + vi.unstubAllEnvs(); + } + }); + + it("keeps MiniMax auto routing on VLM when registry lacks a default model", async () => { + let seenModel: string | undefined; + await withMediaFixture( + { + filePrefix: 
"openclaw-minimax-vlm-default", + extension: "png", + mediaType: "image/png", + fileContents: Buffer.from("image"), + }, + async ({ ctx, media, cache }) => { + const cfg = { + models: { + providers: { + minimax: { + apiKey: "test-minimax-key", + baseUrl: "https://api.minimax.io/anthropic", + models: [ + { + id: "MiniMax-M2.5", + name: "MiniMax M2.5", + reasoning: false, + input: ["text", "image"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 128_000, + maxTokens: 8_192, + }, + ], + }, + }, + }, + } as OpenClawConfig; + + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, + agentDir: "/tmp", + providerRegistry: new Map([ + [ + "minimax", + { + id: "minimax", + capabilities: ["image"], + describeImage: async (req) => { + seenModel = req.model; + return { text: "vlm ok", model: req.model }; + }, + }, + ], + ]), + }); + + expect(result.decision.outcome).toBe("success"); + expect(seenModel).toBe("MiniMax-VL-01"); + expect(requireCapabilityOutput(result, 0)).toMatchObject({ + provider: "minimax", + model: "MiniMax-VL-01", + text: "vlm ok", + }); + }, + ); + }); + + it("keeps non-MiniMax media aliases canonical for image execution", async () => { + const seenProviders: string[] = []; + const cfg = { + tools: { + media: { + image: { + models: [{ provider: "gemini", model: "gemini-3-flash-preview" }], + }, + }, + }, + } as OpenClawConfig; + const providerRegistry = new Map([ + [ + "google", + { + id: "google", + capabilities: ["image" as const], + describeImage: async (req) => { + seenProviders.push(req.provider); + return { text: "google ok", model: req.model }; + }, + }, + ], + ]); + + await withMediaFixture( + { + filePrefix: "openclaw-gemini-media-alias", + extension: "png", + mediaType: "image/png", + fileContents: Buffer.from("image"), + }, + async ({ ctx, media, cache }) => { + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, 
+ agentDir: "/tmp", + providerRegistry, + }); + + expect(result.decision.outcome).toBe("success"); + expect(seenProviders).toEqual(["google"]); + expect(requireCapabilityOutput(result, 0)).toEqual({ + kind: "image.description", + attachmentIndex: 0, + provider: "google", + model: "gemini-3-flash-preview", + text: "google ok", + }); + }, + ); + }); + + it("canonicalizes non-MiniMax active media aliases for auto image resolution", async () => { + vi.stubEnv("GEMINI_API_KEY", "test-gemini-key"); + const cfg = {} as OpenClawConfig; + const pluginRegistry = createEmptyPluginRegistry(); + pluginRegistry.mediaUnderstandingProviders.push({ + pluginId: "google", + pluginName: "Google Provider", + source: "test", + provider: { + id: "google", + capabilities: ["image"], + defaultModels: { image: "gemini-3-flash-preview" }, + describeImage: async () => ({ text: "ok" }), + }, + }); + setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg); + + try { + await expect( + resolveAutoImageModel({ + cfg, + activeModel: { provider: "gemini", model: "gemini-3-flash-preview" }, + }), + ).resolves.toEqual({ + provider: "google", + model: "gemini-3-flash-preview", + }); + } finally { + setActivePluginRegistry(createEmptyPluginRegistry()); + vi.unstubAllEnvs(); + } + }); + it("uses active OpenRouter image models for auto image resolution", async () => { vi.stubEnv("OPENROUTER_API_KEY", "test-openrouter-key"); const cfg = {} as OpenClawConfig; diff --git a/src/media-understanding/runtime.test.ts b/src/media-understanding/runtime.test.ts index 8e299f4cce5..3b5347b3e2d 100644 --- a/src/media-understanding/runtime.test.ts +++ b/src/media-understanding/runtime.test.ts @@ -67,6 +67,10 @@ describe("media-understanding runtime", () => { afterEach(() => { mocks.buildProviderRegistry.mockReset(); mocks.createMediaAttachmentCache.mockReset(); + mocks.createMediaAttachmentCache.mockReturnValue({ + cleanup: mocks.cleanup, + getBuffer: mocks.getBuffer, + }); 
mocks.normalizeMediaAttachments.mockReset(); mocks.normalizeMediaProviderId.mockReset(); mocks.buildMediaUnderstandingRegistry.mockReset(); @@ -186,6 +190,76 @@ describe("media-understanding runtime", () => { expect(mocks.cleanup).toHaveBeenCalledTimes(1); }); + it("classifies extensionless remote image URLs before capability filtering", async () => { + const output: MediaUnderstandingOutput = { + kind: "image.description", + attachmentIndex: 0, + provider: "vision-plugin", + model: "vision-v1", + text: "image ok", + }; + mocks.normalizeMediaAttachments.mockReturnValue([ + { index: 0, url: "https://httpbin.org/image/png", mime: "image/*" }, + ]); + mocks.runCapability.mockResolvedValue({ + outputs: [output], + }); + + await expect( + describeImageFile({ + filePath: "https://httpbin.org/image/png", + cfg: {} as OpenClawConfig, + agentDir: "/tmp/agent", + }), + ).resolves.toEqual({ + text: "image ok", + provider: "vision-plugin", + model: "vision-v1", + output, + }); + + expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({ + MediaUrl: "https://httpbin.org/image/png", + MediaType: "image/*", + }); + expect(requireRunCapabilityRequest()).toMatchObject({ + ctx: { + MediaUrl: "https://httpbin.org/image/png", + MediaType: "image/*", + }, + }); + }); + + it("does not force typed remote URLs into the requested capability", async () => { + const media = [{ index: 0, url: "https://example.com/clip.mp4", mime: "video/mp4" }]; + mocks.normalizeMediaAttachments.mockReturnValue(media); + mocks.runCapability.mockResolvedValue({ + outputs: [], + decision: { capability: "image", outcome: "skipped", attachments: [] }, + }); + + await expect( + describeImageFile({ + filePath: "https://example.com/clip.mp4", + cfg: {} as OpenClawConfig, + agentDir: "/tmp/agent", + }), + ).resolves.toMatchObject({ + text: undefined, + output: undefined, + }); + + expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({ + MediaUrl: "https://example.com/clip.mp4", + MediaType: "video/mp4", 
+ }); + expect(requireRunCapabilityRequest()).toMatchObject({ + capability: "image", + ctx: { MediaUrl: "https://example.com/clip.mp4", MediaType: "video/mp4" }, + media, + }); + }); + it("passes workspaceDir through file media understanding requests", async () => { const output: MediaUnderstandingOutput = { kind: "image.description", @@ -395,6 +469,7 @@ describe("media-understanding runtime", () => { await describeImageFileWithModel({ filePath: "https://example.com/photo.png", mediaUrl: "https://example.com/photo.png", + mime: "image/*", provider: "zai", model: "glm-4.6v", prompt: "Describe it", @@ -412,6 +487,58 @@ describe("media-understanding runtime", () => { expect(mocks.cleanup).toHaveBeenCalledTimes(1); }); + it("fetches remote explicit image descriptions through the media attachment cache", async () => { + mocks.normalizeMediaAttachments.mockReturnValue([ + { index: 0, url: "https://httpbin.org/image/png", mime: "image/png" }, + ]); + mocks.buildProviderRegistry.mockReturnValue( + new Map([["zai", { id: "zai", capabilities: ["image"] }]]), + ); + mocks.getBuffer.mockResolvedValue({ + buffer: Buffer.from("remote-png"), + fileName: "png", + mime: "image/png", + size: 10, + }); + + await expect( + describeImageFileWithModel({ + filePath: "https://httpbin.org/image/png", + provider: "zai", + model: "glm-4.6v", + prompt: "Describe it", + cfg: {} as OpenClawConfig, + agentDir: "/tmp/agent", + timeoutMs: 45_000, + }), + ).resolves.toEqual({ text: "generic image ok", model: "vision" }); + + expect(mocks.readLocalFileSafely).not.toHaveBeenCalled(); + expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({ + MediaUrl: "https://httpbin.org/image/png", + MediaType: "image/*", + }); + expect(mocks.createMediaAttachmentCache).toHaveBeenCalledWith( + [{ index: 0, url: "https://httpbin.org/image/png", mime: "image/png" }], + { ssrfPolicy: undefined }, + ); + expect(mocks.getBuffer).toHaveBeenCalledWith({ + attachmentIndex: 0, + maxBytes: 10 * 1024 * 1024, + 
timeoutMs: 45_000, + }); + expect(mocks.describeImageWithModel).toHaveBeenCalledWith( + expect.objectContaining({ + buffer: Buffer.from("remote-png"), + fileName: "png", + mime: "image/png", + provider: "zai", + model: "glm-4.6v", + }), + ); + expect(mocks.cleanup).toHaveBeenCalledOnce(); + }); + it("routes direct image description through a provider-specific image hook", async () => { const describeImage = vi.fn(async () => ({ text: "image ok", diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts index 6af15949417..0eb408fa2cf 100644 --- a/src/media-understanding/runtime.ts +++ b/src/media-understanding/runtime.ts @@ -1,5 +1,7 @@ import path from "node:path"; +import type { OpenClawConfig } from "../config/types.js"; import { readLocalFileSafely } from "../infra/fs-safe.js"; +import { kindFromMime, mimeTypeFromFilePath } from "../media/mime.js"; import { DEFAULT_MAX_BYTES } from "./defaults.constants.js"; import { describeImageWithModel } from "./image-runtime.js"; import { @@ -48,13 +50,61 @@ function resolveDecisionFailureReason( return normalizeDecisionReason(findDecisionReason(decision, "failed")); } -function buildFileContext(params: { filePath: string; mediaUrl?: string; mime?: string }) { +function buildFileContext(params: { + filePath: string; + mediaUrl?: string; + mime?: string; + capability?: MediaUnderstandingCapability; +}) { + const remoteRef = + params.mediaUrl ?? + (isRemoteMediaReference(params.filePath) ? params.filePath.trim() : undefined); + const extensionMime = remoteRef ? mimeTypeFromFilePath(remoteRef) : undefined; + const extensionKind = kindFromMime(extensionMime); + const mediaType = + params.mime ?? + (remoteRef && params.capability && extensionKind === params.capability + ? `${params.capability}/*` + : extensionMime) ?? + (remoteRef && params.capability ? 
`${params.capability}/*` : undefined); + if (remoteRef) { + return { + MediaUrl: remoteRef, + MediaType: mediaType, + }; + } return { - ...(params.mediaUrl ? { MediaUrl: params.mediaUrl } : { MediaPath: params.filePath }), - MediaType: params.mime, + MediaPath: params.filePath, + MediaType: mediaType, }; } +function isRemoteMediaReference(value: string): boolean { + return /^https?:\/\//i.test(value.trim()); +} + +function concreteMime(mime: string | undefined): string | undefined { + const normalized = mime?.trim(); + if (!normalized || normalized.endsWith("/*")) { + return undefined; + } + return normalized; +} + +function resolveFileLocalRoots(filePath: string): string[] | undefined { + return isRemoteMediaReference(filePath) ? undefined : [path.dirname(filePath)]; +} + +function basenameFromMediaReference(value: string): string { + if (isRemoteMediaReference(value)) { + try { + const url = new URL(value); + return path.basename(url.pathname) || "image"; + } catch {} + } + return path.basename(value); +} + function hasStructuredImageInput(input: ExtractStructuredWithModelParams["input"]): boolean { return input.some((entry) => entry.type === "image"); } @@ -93,7 +143,7 @@ export async function runMediaUnderstandingFile( }, } : params.cfg; - const ctx = buildFileContext(params); + const ctx = buildFileContext({ ...params, capability: params.capability }); const attachments = normalizeMediaAttachments(ctx); if (attachments.length === 0) { return { @@ -114,7 +164,7 @@ export async function runMediaUnderstandingFile( const providerRegistry = buildProviderRegistry(undefined, cfg); const cache = createMediaAttachmentCache(attachments, { - localPathRoots: [path.dirname(params.filePath)], + localPathRoots: params.mediaUrl ? undefined : resolveFileLocalRoots(params.filePath), ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy, }); @@ -166,33 +216,18 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo const timeoutMs = params.timeoutMs ?? 
30_000; const providerRegistry = buildProviderRegistry(undefined, params.cfg); const provider = providerRegistry.get(normalizeMediaProviderId(params.provider)); - let buffer: Buffer; - let fileName = path.basename(params.filePath); - let mime = params.mime; - if (params.mediaUrl) { - const cache = createMediaAttachmentCache(normalizeMediaAttachments(buildFileContext(params)), { - ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy, - }); - try { - const media = await cache.getBuffer({ - attachmentIndex: 0, - maxBytes: DEFAULT_MAX_BYTES.image, - timeoutMs, - }); - buffer = media.buffer; - fileName = media.fileName; - mime = media.mime; - } finally { - await cache.cleanup(); - } - } else { - buffer = (await readLocalFileSafely({ filePath: params.filePath })).buffer; - } + const image = await readImageDescriptionInput({ + filePath: params.filePath, + mediaUrl: params.mediaUrl, + mime: params.mime, + cfg: params.cfg, + timeoutMs, + }); const describeImage = provider?.describeImage ?? describeImageWithModel; return await describeImage({ - buffer, - fileName, - mime, + buffer: image.buffer, + fileName: image.fileName, + mime: image.mime, provider: params.provider, model: params.model, prompt: params.prompt, @@ -204,6 +239,45 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo }); } +async function readImageDescriptionInput(params: { + filePath: string; + mediaUrl?: string; + mime?: string; + cfg: OpenClawConfig; + timeoutMs: number; +}): Promise<{ buffer: Buffer; fileName: string; mime?: string }> { + const remoteRef = + params.mediaUrl ?? + (isRemoteMediaReference(params.filePath) ? 
params.filePath.trim() : undefined); + if (!remoteRef) { + return { + buffer: (await readLocalFileSafely({ filePath: params.filePath })).buffer, + fileName: basenameFromMediaReference(params.filePath), + mime: params.mime, + }; + } + const attachments = normalizeMediaAttachments( + buildFileContext({ ...params, capability: "image" }), + ); + const cache = createMediaAttachmentCache(attachments, { + ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy, + }); + try { + const media = await cache.getBuffer({ + attachmentIndex: 0, + maxBytes: DEFAULT_MAX_BYTES.image, + timeoutMs: params.timeoutMs, + }); + return { + buffer: media.buffer, + fileName: media.fileName || basenameFromMediaReference(remoteRef), + mime: concreteMime(params.mime) ?? media.mime, + }; + } finally { + await cache.cleanup(); + } +} + export async function extractStructuredWithModel(params: ExtractStructuredWithModelParams) { const timeoutMs = params.timeoutMs ?? 30_000; if (!hasStructuredImageInput(params.input)) {