mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-18 15:14:45 +00:00
fix: route image URL describes through MiniMax VLM
Summary:
- Preserve HTTP image describe inputs as remote media.
- Route MiniMax CN image understanding through MiniMax-VL-01.
- Cover CLI, media runtime, tools, Telegram stickers, docs, and changelog.
Verification:
- codex-review clean
- pnpm check:changed via Blacksmith Testbox tbx_01krtdekwak0mygxbw5z7cfb6z
- PR CI green on 516281448e
This commit is contained in:
committed by
GitHub
parent
9a36e897be
commit
5d1f7bf058
@@ -85,6 +85,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Agents/followups: route queued followup turns through CLI runtime backends instead of embedded harness lookup, preventing `claude-cli`/`google-gemini-cli` followups from failing before delivery. Fixes #82847. (#82857) Thanks @hclsys.
|
||||
- CLI/sessions: let `openclaw sessions cleanup --fix-missing` prune malformed rows with unresolvable transcript metadata instead of throwing. Fixes #80970. (#82745) Thanks @IWhatsskill.
|
||||
- Gateway/usage: refresh large session usage summaries in the background and reuse durable transcript metadata so `sessions.usage` no longer blocks Gateway requests on full transcript rescans. Fixes #82773. (#82778) Thanks @hclsys.
|
||||
- CLI/MiniMax media: let `openclaw infer image describe --file` accept HTTP(S) image URLs without treating them as local paths, and keep automatic MiniMax image understanding routed through `MiniMax-VL-01` even when legacy MiniMax M2.x chat metadata claims image input. Fixes #82837. Thanks @mGaolin.
|
||||
- TUI: restore the submitted draft when chat is busy instead of clearing it or queueing another run. Fixes #45326. (#82774) Thanks @hyspacex.
|
||||
- Cron/memory: treat claimed `before_agent_reply` cron hooks as execution progress, so long memory dreaming promotion jobs are not aborted by the isolated-run pre-execution watchdog. Fixes #82811.
|
||||
- Discord: recover transcript-backed full answers when progress-mode final payloads are ellipsis-truncated, so long replies fall back to normal chunked delivery instead of replacing the preview with a shortened message. Fixes #82807. Thanks @blueberry6401.
|
||||
|
||||
@@ -107,19 +107,19 @@ runtime before the provider request is made.
|
||||
|
||||
This table maps common inference tasks to the corresponding infer command.
|
||||
|
||||
| Task | Command | Notes |
|
||||
| ---------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- |
|
||||
| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default |
|
||||
| Run a model prompt on images | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs |
|
||||
| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file |
|
||||
| Describe an image file | `openclaw infer image describe --file ./image.png --prompt "..." --json` | `--model` must be an image-capable `<provider/model>` |
|
||||
| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` |
|
||||
| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented |
|
||||
| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` |
|
||||
| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` |
|
||||
| Search the web | `openclaw infer web search --query "..." --json` | |
|
||||
| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | |
|
||||
| Create embeddings | `openclaw infer embedding create --text "..." --json` | |
|
||||
| Task | Command | Notes |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- |
|
||||
| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default |
|
||||
| Run a model prompt on images | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs |
|
||||
| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file |
|
||||
| Describe an image file or URL | `openclaw infer image describe --file ./image.png --prompt "..." --json` | `--model` must be an image-capable `<provider/model>` |
|
||||
| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` |
|
||||
| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented |
|
||||
| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` |
|
||||
| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` |
|
||||
| Search the web | `openclaw infer web search --query "..." --json` | |
|
||||
| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | |
|
||||
| Create embeddings | `openclaw infer embedding create --text "..." --json` | |
|
||||
|
||||
## Behavior
|
||||
|
||||
@@ -128,6 +128,7 @@ This table maps common inference tasks to the corresponding infer command.
|
||||
- Use `--provider` or `--model provider/model` when a specific backend is required.
|
||||
- Use `model run --thinking <level>` to pass a one-shot thinking/reasoning level (`off`, `minimal`, `low`, `medium`, `high`, `adaptive`, `xhigh`, or `max`) while keeping the run raw.
|
||||
- For `image describe`, `audio transcribe`, and `video describe`, `--model` must use the form `<provider/model>`.
|
||||
- For `image describe`, `--file` accepts local paths and HTTP(S) image URLs. Remote URLs use the normal media-fetch SSRF policy.
|
||||
- For `image describe`, an explicit `--model` runs that provider/model directly. The model must be image-capable in the model catalog or provider config. `codex/<model>` runs a bounded Codex app-server image-understanding turn; `openai-codex/<model>` uses the OpenAI Codex OAuth provider path.
|
||||
- Stateless execution commands default to local.
|
||||
- Gateway-managed state commands default to gateway.
|
||||
@@ -192,6 +193,7 @@ openclaw infer image generate --prompt "slow image backend" --timeout-ms 180000
|
||||
openclaw infer image edit --file ./logo.png --model openai/gpt-image-1.5 --output-format png --background transparent --prompt "keep the logo, remove the background" --json
|
||||
openclaw infer image edit --file ./poster.png --prompt "make this a vertical story ad" --size 2160x3840 --aspect-ratio 9:16 --resolution 4K --json
|
||||
openclaw infer image describe --file ./photo.jpg --json
|
||||
openclaw infer image describe --file https://example.com/photo.png --json
|
||||
openclaw infer image describe --file ./receipt.jpg --prompt "Extract the merchant, date, and total" --json
|
||||
openclaw infer image describe-many --file ./before.png --file ./after.png --prompt "Compare the screenshots and list visible UI changes" --json
|
||||
openclaw infer image describe --file ./ui-screenshot.png --model openai/gpt-4.1-mini --json
|
||||
|
||||
@@ -260,8 +260,8 @@ For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
|
||||
<Note>
|
||||
**MiniMax note**
|
||||
|
||||
- `minimax` and `minimax-portal` image understanding comes from the plugin-owned `MiniMax-VL-01` media provider.
|
||||
- The bundled MiniMax text catalog still starts text-only; explicit `models.providers.minimax` entries materialize image-capable M2.7 chat refs.
|
||||
- `minimax`, `minimax-cn`, `minimax-portal`, and `minimax-portal-cn` image understanding comes from the plugin-owned `MiniMax-VL-01` media provider.
|
||||
- Automatic image routing keeps using `MiniMax-VL-01` even if legacy MiniMax M2.x chat metadata claims image input.
|
||||
|
||||
</Note>
|
||||
|
||||
|
||||
125
extensions/telegram/src/sticker-cache.describe.test.ts
Normal file
125
extensions/telegram/src/sticker-cache.describe.test.ts
Normal file
@@ -0,0 +1,125 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { describeStickerImage } from "./sticker-cache.js";
|
||||
|
||||
// Test doubles shared by the vi.mock factories in this file. They are created
// inside vi.hoisted so the mock factories can reference them.
const mocks = vi.hoisted(() => {
  // Runtime image-description call; always answers as the MiniMax VLM.
  const describeImageFileWithModel = vi.fn(async () => {
    return { text: "vlm ok", model: "MiniMax-VL-01" };
  });

  // Catalog lookup that echoes the requested provider/model and always
  // advertises image input, i.e. "legacy" chat metadata claiming vision.
  const findModelInCatalog = vi.fn((_catalog, provider: string, model: string) => {
    return { provider, id: model, input: ["text", "image"] };
  });

  // Catalog containing only MiniMax M2.7 chat entries that claim image input.
  const loadModelCatalog = vi.fn(async () => {
    return [
      { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] },
      { provider: "minimax", id: "MiniMax-M2.7", input: ["text", "image"] },
    ];
  });

  // An entry supports vision exactly when its input list contains "image".
  const modelSupportsVision = vi.fn((entry: { input?: string[] } | undefined) => {
    return entry?.input?.includes("image") === true;
  });

  const resolveApiKeyForProvider = vi.fn(async () => {
    return { apiKey: "minimax-test" };
  });

  const resolveAutoImageModel = vi.fn(async () => {
    return { provider: "minimax-cn", model: "MiniMax-VL-01" };
  });

  const resolveAutoMediaKeyProviders = vi.fn(() => {
    return ["minimax-cn", "minimax"];
  });

  const resolveDefaultMediaModel = vi.fn(() => {
    return "MiniMax-VL-01";
  });

  // The agent's default chat model is a MiniMax CN M2.7 ref.
  const resolveDefaultModelForAgent = vi.fn(() => {
    return { provider: "minimax-cn", model: "MiniMax-M2.7" };
  });

  return {
    describeImageFileWithModel,
    findModelInCatalog,
    loadModelCatalog,
    modelSupportsVision,
    resolveApiKeyForProvider,
    resolveAutoImageModel,
    resolveAutoMediaKeyProviders,
    resolveDefaultMediaModel,
    resolveDefaultModelForAgent,
  };
});
|
||||
|
||||
// Replace the agent-runtime SDK surface with the hoisted doubles.
vi.mock("openclaw/plugin-sdk/agent-runtime", () => {
  return {
    findModelInCatalog: mocks.findModelInCatalog,
    loadModelCatalog: mocks.loadModelCatalog,
    modelSupportsVision: mocks.modelSupportsVision,
    resolveApiKeyForProvider: mocks.resolveApiKeyForProvider,
    resolveDefaultModelForAgent: mocks.resolveDefaultModelForAgent,
  };
});

// Replace the media-runtime SDK surface with the hoisted doubles.
vi.mock("openclaw/plugin-sdk/media-runtime", () => {
  return {
    resolveAutoImageModel: mocks.resolveAutoImageModel,
    resolveAutoMediaKeyProviders: mocks.resolveAutoMediaKeyProviders,
    resolveDefaultMediaModel: mocks.resolveDefaultMediaModel,
  };
});

// The Telegram runtime only needs to expose the image-description entry point.
vi.mock("./runtime.js", () => {
  return {
    getTelegramRuntime: () => {
      return {
        mediaUnderstanding: {
          describeImageFileWithModel: mocks.describeImageFileWithModel,
        },
      };
    },
  };
});
|
||||
|
||||
// Verifies that sticker description keeps MiniMax providers on the dedicated
// MiniMax-VL-01 model even when chat catalog metadata claims image input.
describe("describeStickerImage", () => {
  beforeEach(() => {
    mocks.describeImageFileWithModel.mockClear();
    mocks.findModelInCatalog.mockClear();
    mocks.loadModelCatalog.mockClear();
    mocks.modelSupportsVision.mockClear();
    mocks.resolveApiKeyForProvider.mockClear();
    mocks.resolveAutoImageModel.mockClear();
    mocks.resolveAutoMediaKeyProviders.mockClear();
    mocks.resolveDefaultMediaModel.mockClear();
    mocks.resolveDefaultModelForAgent.mockClear();

    // mockClear() only wipes call history; it keeps mockReturnValue /
    // mockResolvedValue overrides installed by an earlier test. Re-seed the
    // mocks that individual tests override so every test starts from the same
    // baseline regardless of execution order.
    mocks.resolveAutoMediaKeyProviders.mockReturnValue(["minimax-cn", "minimax"]);
    mocks.loadModelCatalog.mockResolvedValue([
      { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] },
      { provider: "minimax", id: "MiniMax-M2.7", input: ["text", "image"] },
    ]);
  });

  it("uses MiniMax VLM auto selection instead of legacy chat vision catalog entries", async () => {
    await expect(
      describeStickerImage({
        imagePath: "/tmp/sticker.webp",
        cfg: {},
        agentDir: "/tmp/agent",
      }),
    ).resolves.toBe("vlm ok");

    // The default chat provider is MiniMax CN, so the media default must be
    // requested without configured (chat) image models.
    expect(mocks.resolveDefaultMediaModel).toHaveBeenCalledWith({
      cfg: {},
      providerId: "minimax-cn",
      capability: "image",
      includeConfiguredImageModels: false,
    });
    expect(mocks.resolveAutoImageModel).not.toHaveBeenCalled();
    expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith(
      expect.objectContaining({
        filePath: "/tmp/sticker.webp",
        provider: "minimax-cn",
        model: "MiniMax-VL-01",
      }),
    );
  });

  it("keeps MiniMax chat defaults on MiniMax VLM when other vision providers are configured", async () => {
    // Overrides apply to this test only; beforeEach restores the baseline.
    mocks.resolveAutoMediaKeyProviders.mockReturnValue(["openai", "minimax-cn", "minimax"]);
    mocks.loadModelCatalog.mockResolvedValue([
      { provider: "openai", id: "gpt-5.4", input: ["text", "image"] },
      { provider: "minimax-cn", id: "MiniMax-M2.7", input: ["text", "image"] },
      { provider: "minimax-cn", id: "MiniMax-VL-01", input: ["image"] },
    ]);

    await expect(
      describeStickerImage({
        imagePath: "/tmp/sticker.webp",
        cfg: {},
        agentDir: "/tmp/agent",
      }),
    ).resolves.toBe("vlm ok");

    expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith(
      expect.objectContaining({
        provider: "minimax-cn",
        model: "MiniMax-VL-01",
      }),
    );
    expect(mocks.describeImageFileWithModel).not.toHaveBeenCalledWith(
      expect.objectContaining({
        provider: "openai",
      }),
    );
  });
});
|
||||
@@ -27,6 +27,16 @@ export {
|
||||
const STICKER_DESCRIPTION_PROMPT =
|
||||
"Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
|
||||
|
||||
/**
 * Reports whether `provider` is one of the MiniMax provider ids
 * (`minimax`, `minimax-cn`, `minimax-portal`, `minimax-portal-cn`).
 *
 * The id is passed through `normalizeLowercaseStringOrEmpty` before matching
 * (presumably trims/lowercases and maps non-strings to "" — confirm against
 * the helper).
 */
function isMinimaxVlmProvider(provider: string): boolean {
  switch (normalizeLowercaseStringOrEmpty(provider)) {
    case "minimax":
    case "minimax-cn":
    case "minimax-portal":
    case "minimax-portal-cn":
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
export interface DescribeStickerParams {
|
||||
imagePath: string;
|
||||
cfg: OpenClawConfig;
|
||||
@@ -50,7 +60,17 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
|
||||
const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
|
||||
const supportsVision = modelSupportsVision(entry);
|
||||
if (supportsVision) {
|
||||
activeModel = { provider: defaultModel.provider, model: defaultModel.model };
|
||||
const model = isMinimaxVlmProvider(defaultModel.provider)
|
||||
? resolveDefaultMediaModel({
|
||||
cfg,
|
||||
providerId: defaultModel.provider,
|
||||
capability: "image",
|
||||
includeConfiguredImageModels: false,
|
||||
})
|
||||
: defaultModel.model;
|
||||
if (model) {
|
||||
activeModel = { provider: defaultModel.provider, model };
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// Ignore catalog failures; fall back to auto selection.
|
||||
@@ -83,8 +103,12 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
|
||||
cfg,
|
||||
providerId: provider,
|
||||
capability: "image",
|
||||
includeConfiguredImageModels: !isMinimaxVlmProvider(provider),
|
||||
});
|
||||
const preferred = entries.find((entry) => entry.id === defaultId);
|
||||
if (isMinimaxVlmProvider(provider)) {
|
||||
return preferred;
|
||||
}
|
||||
return preferred ?? entries[0];
|
||||
};
|
||||
|
||||
|
||||
@@ -75,6 +75,61 @@ describe("minimaxUnderstandImage apiKey normalization", () => {
|
||||
expect(fetchSpy).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
it.each(["minimax-cn", "minimax-portal-cn"])(
|
||||
"routes %s to the CN VLM host by default",
|
||||
async (provider) => {
|
||||
const fetchSpy = vi.fn(async (input: RequestInfo | URL) => {
|
||||
const requestUrl =
|
||||
typeof input === "string" ? input : input instanceof URL ? input.href : input.url;
|
||||
expect(requestUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
|
||||
return new Response(apiResponse, {
|
||||
status: 200,
|
||||
headers: { "Content-Type": "application/json" },
|
||||
});
|
||||
});
|
||||
global.fetch = withFetchPreconnect(fetchSpy);
|
||||
|
||||
await expect(
|
||||
minimaxUnderstandImage({
|
||||
apiKey: "minimax-test-key",
|
||||
provider,
|
||||
prompt: "hi",
|
||||
imageDataUrl: "data:image/png;base64,AAAA",
|
||||
}),
|
||||
).resolves.toBe("ok");
|
||||
|
||||
expect(fetchSpy).toHaveBeenCalledOnce();
|
||||
},
|
||||
);
|
||||
|
||||
it.each(["minimax-cn", "minimax-portal-cn"])(
|
||||
"keeps %s on the CN VLM host when the configured host is malformed",
|
||||
async (provider) => {
|
||||
const fetchSpy = vi.fn(async (input: RequestInfo | URL) => {
|
||||
const requestUrl =
|
||||
typeof input === "string" ? input : input instanceof URL ? input.href : input.url;
|
||||
expect(requestUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
|
||||
return new Response(apiResponse, {
|
||||
status: 200,
|
||||
headers: { "Content-Type": "application/json" },
|
||||
});
|
||||
});
|
||||
global.fetch = withFetchPreconnect(fetchSpy);
|
||||
|
||||
await expect(
|
||||
minimaxUnderstandImage({
|
||||
apiKey: "minimax-test-key",
|
||||
provider,
|
||||
apiHost: "https://[",
|
||||
prompt: "hi",
|
||||
imageDataUrl: "data:image/png;base64,AAAA",
|
||||
}),
|
||||
).resolves.toBe("ok");
|
||||
|
||||
expect(fetchSpy).toHaveBeenCalledOnce();
|
||||
},
|
||||
);
|
||||
|
||||
it("uses the caller-provided request timeout", async () => {
|
||||
const timeoutSpy = vi.spyOn(AbortSignal, "timeout");
|
||||
const fetchSpy = vi.fn(async () => {
|
||||
@@ -103,7 +158,9 @@ describe("minimaxUnderstandImage apiKey normalization", () => {
|
||||
describe("isMinimaxVlmModel", () => {
|
||||
it("only matches the canonical MiniMax VLM model id", () => {
|
||||
expect(isMinimaxVlmModel("minimax", "MiniMax-VL-01")).toBe(true);
|
||||
expect(isMinimaxVlmModel("minimax-cn", "MiniMax-VL-01")).toBe(true);
|
||||
expect(isMinimaxVlmModel("minimax-portal", "MiniMax-VL-01")).toBe(true);
|
||||
expect(isMinimaxVlmModel("minimax-portal-cn", "MiniMax-VL-01")).toBe(true);
|
||||
expect(isMinimaxVlmModel("minimax-portal", "custom-vision")).toBe(false);
|
||||
expect(isMinimaxVlmModel("openai", "MiniMax-VL-01")).toBe(false);
|
||||
});
|
||||
|
||||
@@ -8,35 +8,54 @@ type MinimaxBaseResp = {
|
||||
};
|
||||
|
||||
/**
 * Reports whether `provider` is a MiniMax provider id: `minimax`,
 * `minimax-cn`, `minimax-portal`, or `minimax-portal-cn`.
 *
 * Matching ignores surrounding whitespace and letter case.
 *
 * @param provider - Provider id to classify.
 * @returns `true` for the four MiniMax ids above, `false` otherwise.
 */
export function isMinimaxVlmProvider(provider: string): boolean {
  // The stale exact-match return (global ids only) is dropped: it ran first and
  // made the CN ids and the normalization below unreachable.
  const normalized = provider.trim().toLowerCase();
  return (
    normalized === "minimax" ||
    normalized === "minimax-cn" ||
    normalized === "minimax-portal" ||
    normalized === "minimax-portal-cn"
  );
}
|
||||
|
||||
/**
 * Reports whether the provider/model pair refers to the canonical MiniMax VLM:
 * the provider must be a MiniMax provider id and the trimmed model id must be
 * exactly `MiniMax-VL-01` (case-sensitive).
 */
export function isMinimaxVlmModel(provider: string, modelId: string): boolean {
  if (!isMinimaxVlmProvider(provider)) {
    return false;
  }
  return modelId.trim() === "MiniMax-VL-01";
}
|
||||
|
||||
/**
 * Reports whether `provider` is one of the MiniMax CN provider ids
 * (`minimax-cn`, `minimax-portal-cn`), ignoring whitespace and letter case.
 */
function isMinimaxCnProvider(provider: string | undefined): boolean {
  const normalized = provider?.trim().toLowerCase();
  return normalized === "minimax-cn" || normalized === "minimax-portal-cn";
}

/**
 * Resolves the origin (scheme + host + optional port) to use for MiniMax VLM
 * requests.
 *
 * Candidates are tried in this order, the first non-blank one wins:
 * `apiHost`, `env.MINIMAX_API_HOST`, `modelBaseUrl`, then the provider default
 * (`https://api.minimaxi.com` for CN provider ids, `https://api.minimax.io`
 * otherwise). Any path on the chosen value is dropped.
 *
 * @param params.apiHost - Explicit host override.
 * @param params.modelBaseUrl - Model base URL; only its origin is used.
 * @param params.provider - Provider id used to pick the default host.
 * @param params.env - Environment to read; defaults to `process.env`.
 * @returns A usable origin; falls back to the provider default when the chosen
 *   value cannot be parsed.
 */
function coerceApiHost(params: {
  apiHost?: string;
  modelBaseUrl?: string;
  provider?: string;
  env?: NodeJS.ProcessEnv;
}): string {
  const env = params.env ?? process.env;
  const defaultHost = isMinimaxCnProvider(params.provider)
    ? "https://api.minimaxi.com"
    : "https://api.minimax.io";
  // `||` (not `??`) is intentional: blank strings must fall through.
  const raw =
    params.apiHost?.trim() ||
    env.MINIMAX_API_HOST?.trim() ||
    params.modelBaseUrl?.trim() ||
    defaultHost;

  try {
    const url = new URL(raw);
    // A scheme-less "host:port" parses as an opaque URL ("host" becomes the
    // scheme) whose origin serializes to the literal string "null". Only trust
    // a real origin; otherwise keep going and retry with an https:// prefix.
    if (url.origin !== "null") {
      return url.origin;
    }
  } catch {
    // Not an absolute URL; handled below.
  }

  // The value already carries a scheme but is unusable (malformed or without a
  // real origin); prefixing https:// would not repair it.
  if (/^[a-z][a-z\d+.-]*:\/\//i.test(raw)) {
    return defaultHost;
  }

  try {
    // Bare host (optionally with port/path): assume https.
    const url = new URL(`https://${raw}`);
    return url.origin;
  } catch {
    return defaultHost;
  }
}
|
||||
|
||||
@@ -51,6 +70,7 @@ export async function minimaxUnderstandImage(params: {
|
||||
imageDataUrl: string;
|
||||
apiHost?: string;
|
||||
modelBaseUrl?: string;
|
||||
provider?: string;
|
||||
timeoutMs?: number;
|
||||
}): Promise<string> {
|
||||
const apiKey = normalizeSecretInput(params.apiKey);
|
||||
@@ -72,6 +92,7 @@ export async function minimaxUnderstandImage(params: {
|
||||
const host = coerceApiHost({
|
||||
apiHost: params.apiHost,
|
||||
modelBaseUrl: params.modelBaseUrl,
|
||||
provider: params.provider,
|
||||
});
|
||||
const url = new URL("/v1/coding_plan/vlm", host).toString();
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ import type { AssistantMessage } from "@earendil-works/pi-ai";
|
||||
import type { OpenClawConfig } from "../../config/types.openclaw.js";
|
||||
import { estimateBase64DecodedBytes } from "../../media/base64.js";
|
||||
import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
|
||||
import { isMinimaxVlmProvider } from "../minimax-vlm.js";
|
||||
import { findNormalizedProviderValue, normalizeProviderId } from "../model-selection.js";
|
||||
import { extractAssistantText } from "../pi-embedded-utils.js";
|
||||
import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.helpers.js";
|
||||
@@ -238,6 +239,9 @@ export function resolveProviderVisionModelFromConfig(params: {
|
||||
cfg?: OpenClawConfig;
|
||||
provider: string;
|
||||
}): string | null {
|
||||
if (isMinimaxVlmProvider(params.provider)) {
|
||||
return null;
|
||||
}
|
||||
const providerCfg = findNormalizedProviderValue(
|
||||
params.cfg?.models?.providers,
|
||||
params.provider,
|
||||
|
||||
@@ -181,7 +181,9 @@ async function createOpenClawCodingToolsWithFreshModules(options?: CreateOpenCla
|
||||
const defaultImageModels = new Map<string, string>([
|
||||
["anthropic", "claude-opus-4-6"],
|
||||
["minimax", "MiniMax-VL-01"],
|
||||
["minimax-cn", "MiniMax-VL-01"],
|
||||
["minimax-portal", "MiniMax-VL-01"],
|
||||
["minimax-portal-cn", "MiniMax-VL-01"],
|
||||
["openai", "gpt-5.4-mini"],
|
||||
["opencode", "gpt-5-nano"],
|
||||
["opencode-go", "kimi-k2.6"],
|
||||
@@ -482,7 +484,9 @@ function installImageUnderstandingProviderStubs(...providers: MediaUnderstanding
|
||||
const defaultImageModels = new Map<string, string>([
|
||||
["anthropic", "claude-opus-4-6"],
|
||||
["minimax", "MiniMax-VL-01"],
|
||||
["minimax-cn", "MiniMax-VL-01"],
|
||||
["minimax-portal", "MiniMax-VL-01"],
|
||||
["minimax-portal-cn", "MiniMax-VL-01"],
|
||||
["openai", "gpt-5.4-mini"],
|
||||
["opencode", "gpt-5-nano"],
|
||||
["opencode-go", "kimi-k2.6"],
|
||||
@@ -764,6 +768,127 @@ describe("image tool implicit imageModel config", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps MiniMax CN chat metadata off automatic image routing", async () => {
|
||||
await withTempAgentDir(async (agentDir) => {
|
||||
const cfg: OpenClawConfig = {
|
||||
agents: { defaults: { model: { primary: "minimax-cn/MiniMax-M2.5" } } },
|
||||
models: {
|
||||
mode: "merge",
|
||||
providers: {
|
||||
"minimax-cn": {
|
||||
baseUrl: "https://api.minimaxi.com/anthropic",
|
||||
apiKey: "${MINIMAX_API_KEY}",
|
||||
api: "anthropic-messages",
|
||||
models: [makeModelDefinition("MiniMax-M2.5", ["text", "image"])],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
const authStore = {
|
||||
version: 1,
|
||||
profiles: {
|
||||
mini: { type: "api_key", provider: "minimax-cn", key: "minimax-test" },
|
||||
miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
|
||||
},
|
||||
} as const;
|
||||
|
||||
expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
|
||||
primary: "minimax-cn/MiniMax-VL-01",
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
it("prefers configured MiniMax CN image alias over canonical auto fallback", async () => {
|
||||
await withTempAgentDir(async (agentDir) => {
|
||||
const defaultImageModels = new Map<string, string>([
|
||||
["anthropic", "claude-opus-4-6"],
|
||||
["minimax", "MiniMax-VL-01"],
|
||||
["minimax-cn", "MiniMax-VL-01"],
|
||||
["openai", "gpt-5.4-mini"],
|
||||
]);
|
||||
__testing.setProviderDepsForTest({
|
||||
buildProviderRegistry: (overrides?: Record<string, MediaUnderstandingProvider>) =>
|
||||
imageProviderHarness.buildProviderRegistry(overrides),
|
||||
getMediaUnderstandingProvider: (
|
||||
id: string,
|
||||
registry: Map<string, MediaUnderstandingProvider>,
|
||||
) => imageProviderHarness.getMediaUnderstandingProvider(id, registry),
|
||||
describeImageWithModel: describeGenericImageWithModel,
|
||||
describeImagesWithModel: describeGenericImagesWithModel,
|
||||
resolveAutoMediaKeyProviders: ({ capability }) =>
|
||||
capability === "image" ? ["openai", "anthropic", "minimax-cn", "minimax"] : [],
|
||||
resolveDefaultMediaModel: ({ providerId, capability }) =>
|
||||
capability === "image" ? defaultImageModels.get(providerId.toLowerCase()) : undefined,
|
||||
});
|
||||
const cfg: OpenClawConfig = {
|
||||
models: {
|
||||
mode: "merge",
|
||||
providers: {
|
||||
"minimax-cn": {
|
||||
baseUrl: "https://api.minimaxi.com/anthropic",
|
||||
apiKey: "${MINIMAX_API_KEY}",
|
||||
api: "anthropic-messages",
|
||||
models: [makeModelDefinition("MiniMax-M2.5", ["text", "image"])],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
const authStore = {
|
||||
version: 1,
|
||||
profiles: {
|
||||
mini: { type: "api_key", provider: "minimax-cn", key: "minimax-test" },
|
||||
miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
|
||||
},
|
||||
} as const;
|
||||
|
||||
expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
|
||||
primary: "minimax-cn/MiniMax-VL-01",
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps canonical MiniMax fallback when configured CN alias has no image candidate", async () => {
|
||||
await withTempAgentDir(async (agentDir) => {
|
||||
__testing.setProviderDepsForTest({
|
||||
buildProviderRegistry: (overrides?: Record<string, MediaUnderstandingProvider>) =>
|
||||
imageProviderHarness.buildProviderRegistry(overrides),
|
||||
getMediaUnderstandingProvider: (
|
||||
id: string,
|
||||
registry: Map<string, MediaUnderstandingProvider>,
|
||||
) => imageProviderHarness.getMediaUnderstandingProvider(id, registry),
|
||||
describeImageWithModel: describeGenericImageWithModel,
|
||||
describeImagesWithModel: describeGenericImagesWithModel,
|
||||
resolveAutoMediaKeyProviders: ({ capability }) =>
|
||||
capability === "image" ? ["minimax"] : [],
|
||||
resolveDefaultMediaModel: ({ providerId, capability }) =>
|
||||
capability === "image" && providerId === "minimax" ? "MiniMax-VL-01" : undefined,
|
||||
});
|
||||
const cfg: OpenClawConfig = {
|
||||
models: {
|
||||
mode: "merge",
|
||||
providers: {
|
||||
"minimax-cn": {
|
||||
baseUrl: "https://api.minimaxi.com/anthropic",
|
||||
apiKey: "${MINIMAX_API_KEY}",
|
||||
api: "anthropic-messages",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
const authStore = {
|
||||
version: 1,
|
||||
profiles: {
|
||||
miniGlobal: { type: "api_key", provider: "minimax", key: "minimax-test" },
|
||||
},
|
||||
} as const;
|
||||
|
||||
expect(resolveImageModelConfigForTool({ cfg, agentDir, authStore })).toEqual({
|
||||
primary: "minimax/MiniMax-VL-01",
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
it("passes the configured image timeout to provider calls", async () => {
|
||||
await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => {
|
||||
await withTempAgentDir(async (agentDir) => {
|
||||
|
||||
@@ -68,6 +68,50 @@ const imageToolProviderDeps = {
|
||||
resolveDefaultMediaModel,
|
||||
};
|
||||
|
||||
/**
 * Reports whether the config names a default primary model explicitly.
 *
 * `agents.defaults.model` may be a plain string or an object with a `primary`
 * string; either form counts only when it is non-blank after trimming.
 */
function hasExplicitDefaultPrimaryModel(cfg?: OpenClawConfig): boolean {
  const configured = cfg?.agents?.defaults?.model;
  const primary = typeof configured === "string" ? configured : configured?.primary;
  return typeof primary === "string" && primary.trim().length > 0;
}
|
||||
|
||||
/**
 * Extracts the provider part of a `provider/model` reference.
 *
 * @param candidate - Model reference, possibly null/undefined or padded.
 * @returns The trimmed text before the first `/`, or `undefined` when the
 *   reference is missing or contains no `/`. A leading `/` yields `""`.
 */
function modelRefProvider(candidate: string | null | undefined): string | undefined {
  const ref = candidate?.trim();
  if (ref === undefined) {
    return undefined;
  }
  const slashAt = ref.indexOf("/");
  if (slashAt < 0) {
    return undefined;
  }
  return ref.slice(0, slashAt).trim();
}
|
||||
|
||||
/**
 * Reports whether `candidate` is an alias-form reference for `provider`:
 * its provider part differs from its own normalized media provider id, yet
 * normalizes to the same id as `provider`.
 *
 * Returns `false` when the candidate has no provider part.
 */
function isExecutionAliasCandidateForProvider(
  candidate: string | null | undefined,
  provider: string,
): boolean {
  const aliasProvider = modelRefProvider(candidate);
  if (!aliasProvider) {
    return false;
  }
  // Already in canonical form, so it is not an alias.
  if (aliasProvider === normalizeMediaProviderId(aliasProvider)) {
    return false;
  }
  return normalizeMediaProviderId(aliasProvider) === normalizeMediaProviderId(provider);
}
|
||||
|
||||
/**
 * Reports whether a canonical MiniMax candidate is shadowed by an alias-form
 * candidate in `candidates`.
 *
 * Only candidates whose provider part is already its normalized media provider
 * id and is a MiniMax VLM provider can be shadowed; for those, the result is
 * `true` when any entry of `candidates` is an execution alias for that provider.
 */
function isCanonicalCandidateShadowedByExecutionAlias(
  candidate: string | null | undefined,
  candidates: readonly (string | null | undefined)[],
): boolean {
  const canonicalProvider = modelRefProvider(candidate);
  if (
    !canonicalProvider ||
    canonicalProvider !== normalizeMediaProviderId(canonicalProvider) ||
    !isMinimaxVlmProvider(canonicalProvider)
  ) {
    return false;
  }
  for (const other of candidates) {
    if (isExecutionAliasCandidateForProvider(other, canonicalProvider)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
export const __testing = {
|
||||
decodeDataUrl,
|
||||
coerceImageAssistantText,
|
||||
@@ -148,6 +192,7 @@ export function resolveImageModelConfigForTool(params: {
|
||||
workspaceDir: params.workspaceDir,
|
||||
providerId: primary.provider,
|
||||
capability: "image",
|
||||
includeConfiguredImageModels: !isMinimaxVlmProvider(primary.provider),
|
||||
});
|
||||
if (providerDefault) {
|
||||
return [`${primary.provider}/${providerDefault}`];
|
||||
@@ -158,7 +203,7 @@ export function resolveImageModelConfigForTool(params: {
|
||||
return [];
|
||||
})();
|
||||
|
||||
const autoCandidates = imageToolProviderDeps
|
||||
const rawAutoCandidates = imageToolProviderDeps
|
||||
.resolveAutoMediaKeyProviders({
|
||||
cfg: params.cfg,
|
||||
workspaceDir: params.workspaceDir,
|
||||
@@ -170,15 +215,33 @@ export function resolveImageModelConfigForTool(params: {
|
||||
workspaceDir: params.workspaceDir,
|
||||
providerId,
|
||||
capability: "image",
|
||||
includeConfiguredImageModels: !isMinimaxVlmProvider(providerId),
|
||||
});
|
||||
return modelId ? `${providerId}/${modelId}` : null;
|
||||
});
|
||||
const autoCandidates = rawAutoCandidates.filter(
|
||||
(candidate) =>
|
||||
!isCanonicalCandidateShadowedByExecutionAlias(candidate, [
|
||||
...primaryCandidates,
|
||||
...rawAutoCandidates,
|
||||
]),
|
||||
);
|
||||
const defaultPrimaryIsImplicit = !hasExplicitDefaultPrimaryModel(params.cfg);
|
||||
const primaryAliasCandidates = defaultPrimaryIsImplicit
|
||||
? autoCandidates.filter((candidate) =>
|
||||
isExecutionAliasCandidateForProvider(candidate, primary.provider),
|
||||
)
|
||||
: [];
|
||||
const remainingAutoCandidates =
|
||||
primaryAliasCandidates.length === 0
|
||||
? autoCandidates
|
||||
: autoCandidates.filter((candidate) => !primaryAliasCandidates.includes(candidate));
|
||||
|
||||
return buildToolModelConfigFromCandidates({
|
||||
explicit,
|
||||
agentDir: params.agentDir,
|
||||
authStore: params.authStore,
|
||||
candidates: [...primaryCandidates, ...autoCandidates],
|
||||
candidates: [...primaryAliasCandidates, ...primaryCandidates, ...remainingAutoCandidates],
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -28,6 +28,9 @@ vi.mock("./model-config.helpers.js", () => ({
|
||||
if (provider === "google") {
|
||||
return Boolean(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
|
||||
}
|
||||
if (provider === "minimax" || provider === "minimax-cn") {
|
||||
return Boolean(process.env.MINIMAX_API_KEY);
|
||||
}
|
||||
return false;
|
||||
},
|
||||
resolveDefaultModelRef: (cfg?: OpenClawConfig) => {
|
||||
@@ -105,4 +108,33 @@ describe("resolvePdfModelConfigForTool", () => {
|
||||
ANTHROPIC_PDF_MODEL,
|
||||
);
|
||||
});
|
||||
|
||||
it("does not add configured MiniMax chat models as automatic PDF image fallbacks", () => {
|
||||
vi.stubEnv("MINIMAX_API_KEY", "minimax-test");
|
||||
const cfg = {
|
||||
...withDefaultModel("openai/gpt-5.4"),
|
||||
models: {
|
||||
providers: {
|
||||
minimax: {
|
||||
baseUrl: "https://api.minimax.io/anthropic",
|
||||
models: [
|
||||
{
|
||||
id: "MiniMax-M2.7",
|
||||
name: "MiniMax M2.7",
|
||||
reasoning: false,
|
||||
input: ["text", "image"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128_000,
|
||||
maxTokens: 8_192,
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig;
|
||||
|
||||
expect(resolvePdfModelConfigForTool({ cfg, agentDir: TEST_AGENT_DIR })).toEqual({
|
||||
primary: "minimax/MiniMax-VL-01",
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -5,6 +5,7 @@ import {
|
||||
resolveDefaultMediaModel,
|
||||
} from "../../media-understanding/defaults.js";
|
||||
import type { AuthProfileStore } from "../auth-profiles/types.js";
|
||||
import { isMinimaxVlmProvider } from "../minimax-vlm.js";
|
||||
import {
|
||||
coerceImageModelConfig,
|
||||
type ImageModelConfig,
|
||||
@@ -45,6 +46,7 @@ function resolveImageCandidateRefs(params: {
|
||||
workspaceDir: params.workspaceDir,
|
||||
providerId,
|
||||
capability: "image",
|
||||
includeConfiguredImageModels: !isMinimaxVlmProvider(providerId),
|
||||
});
|
||||
return modelId ? `${providerId}/${modelId}` : null;
|
||||
})
|
||||
@@ -106,6 +108,7 @@ export function resolvePdfModelConfigForTool(params: {
|
||||
workspaceDir: params.workspaceDir,
|
||||
providerId: primary.provider,
|
||||
capability: "image",
|
||||
includeConfiguredImageModels: !isMinimaxVlmProvider(primary.provider),
|
||||
});
|
||||
const primarySupportsNativePdf = providerSupportsNativePdfDocument({
|
||||
cfg: params.cfg,
|
||||
@@ -136,6 +139,7 @@ export function resolvePdfModelConfigForTool(params: {
|
||||
const providerId = providerKey.trim();
|
||||
if (
|
||||
!providerId ||
|
||||
isMinimaxVlmProvider(providerId) ||
|
||||
!hasAuthForProvider({
|
||||
provider: providerId,
|
||||
agentDir: params.agentDir,
|
||||
|
||||
@@ -1125,6 +1125,26 @@ describe("capability cli", () => {
|
||||
expect(outputs[0]?.kind).toBe("image.description");
|
||||
});
|
||||
|
||||
it("keeps image describe HTTP URLs as URLs", async () => {
|
||||
await runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
argv: [
|
||||
"capability",
|
||||
"image",
|
||||
"describe",
|
||||
"--file",
|
||||
"https://httpbin.org/image/png",
|
||||
"--json",
|
||||
],
|
||||
});
|
||||
|
||||
const describeCall = imageDescribeCall();
|
||||
expect(describeCall?.filePath).toBe("https://httpbin.org/image/png");
|
||||
const output = firstJsonOutput();
|
||||
const outputs = output?.outputs as Array<Record<string, unknown>>;
|
||||
expect(outputs[0]?.path).toBe("https://httpbin.org/image/png");
|
||||
});
|
||||
|
||||
it("passes image describe prompts through media understanding", async () => {
|
||||
await runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
@@ -1221,6 +1241,28 @@ describe("capability cli", () => {
|
||||
expect(outputs[0]?.path).toBe("https://example.com/photo.png");
|
||||
});
|
||||
|
||||
it("keeps explicit-model image describe HTTP URLs as URLs", async () => {
|
||||
await runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
argv: [
|
||||
"capability",
|
||||
"image",
|
||||
"describe",
|
||||
"--file",
|
||||
"https://httpbin.org/image/png",
|
||||
"--model",
|
||||
"minimax-cn/MiniMax-VL-01",
|
||||
"--json",
|
||||
],
|
||||
});
|
||||
|
||||
const describeCall = firstImageDescribeWithModelCall();
|
||||
expect(describeCall?.filePath).toBe("https://httpbin.org/image/png");
|
||||
expect(describeCall?.provider).toBe("minimax-cn");
|
||||
expect(describeCall?.model).toBe("MiniMax-VL-01");
|
||||
expect(mocks.describeImageFile).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("passes describe-many prompts to each image", async () => {
|
||||
await runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
|
||||
@@ -1097,8 +1097,8 @@ async function runImageDescribe(params: {
|
||||
const prompt = normalizeOptionalString(params.prompt);
|
||||
const outputs = await Promise.all(
|
||||
params.files.map(async (filePath) => {
|
||||
const isRemoteUrl = /^https?:\/\//i.test(filePath.trim());
|
||||
const resolvedPath = isRemoteUrl ? filePath.trim() : path.resolve(filePath);
|
||||
const resolvedPath = resolveImageDescribeInput(filePath);
|
||||
const isRemoteUrl = /^https?:\/\//i.test(resolvedPath);
|
||||
const result = activeModel
|
||||
? await describeImageFileWithModel({
|
||||
filePath: resolvedPath,
|
||||
@@ -1513,6 +1513,11 @@ async function runTtsProviders(transport: CapabilityTransport) {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Normalizes a single `image describe` input value.
 *
 * Inputs that start with `http://` or `https://` (case-insensitive, checked
 * after trimming surrounding whitespace) are returned trimmed and otherwise
 * untouched. Every other input is resolved to an absolute filesystem path.
 *
 * @param filePath Raw `--file` value supplied by the caller.
 * @returns The trimmed URL, or the absolute path for non-URL inputs.
 */
function resolveImageDescribeInput(filePath: string): string {
  const candidate = filePath.trim();
  if (/^https?:\/\//i.test(candidate)) {
    return candidate;
  }
  // Local paths are resolved from the raw (untrimmed) input.
  return path.resolve(filePath);
}
|
||||
|
||||
async function runTtsPersonas(transport: CapabilityTransport) {
|
||||
if (transport === "gateway") {
|
||||
return await callGateway({
|
||||
|
||||
@@ -54,6 +54,14 @@ type AttachmentCacheEntry = {
|
||||
|
||||
let defaultLocalPathRoots: readonly string[] | undefined;
|
||||
|
||||
/**
 * Returns a usable MIME type, or `undefined` when the hint carries no concrete
 * subtype.
 *
 * A hint is rejected when it is missing, empty after trimming, or ends with
 * the wildcard suffix `/*` (for example `image/*`).
 *
 * @param mime MIME hint, possibly undefined or a wildcard.
 * @returns The trimmed MIME string, or `undefined` if it is not concrete.
 */
function concreteMime(mime: string | undefined): string | undefined {
  const candidate = mime?.trim() ?? "";
  const isConcrete = candidate.length > 0 && !candidate.endsWith("/*");
  return isConcrete ? candidate : undefined;
}
|
||||
|
||||
function getDefaultLocalPathRoots(): readonly string[] {
|
||||
defaultLocalPathRoots ??= mergeInboundPathRoots(getDefaultMediaLocalRoots());
|
||||
return defaultLocalPathRoots;
|
||||
@@ -128,7 +136,7 @@ export class MediaAttachmentCache {
|
||||
entry.buffer = buffer;
|
||||
entry.bufferMime =
|
||||
entry.bufferMime ??
|
||||
entry.attachment.mime ??
|
||||
concreteMime(entry.attachment.mime) ??
|
||||
(await detectMime({
|
||||
buffer,
|
||||
filePath,
|
||||
@@ -169,7 +177,7 @@ export class MediaAttachmentCache {
|
||||
});
|
||||
entry.buffer = fetched.buffer;
|
||||
entry.bufferMime =
|
||||
entry.attachment.mime ??
|
||||
concreteMime(entry.attachment.mime) ??
|
||||
fetched.contentType ??
|
||||
(await detectMime({
|
||||
buffer: fetched.buffer,
|
||||
|
||||
@@ -140,6 +140,30 @@ describe("resolveDefaultMediaModel", () => {
|
||||
"kimi-k2.6",
|
||||
);
|
||||
});
|
||||
|
||||
it("prefers configured image models before manifest defaults", () => {
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
openrouter: {
|
||||
models: [{ id: "google/gemini-2.5-flash", input: ["text", "image"] }],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as never;
|
||||
|
||||
expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "image", cfg })).toBe(
|
||||
"google/gemini-2.5-flash",
|
||||
);
|
||||
expect(
|
||||
resolveDefaultMediaModel({
|
||||
providerId: "openrouter",
|
||||
capability: "image",
|
||||
cfg,
|
||||
includeConfiguredImageModels: false,
|
||||
}),
|
||||
).toBe("auto");
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveAutoMediaKeyProviders", () => {
|
||||
@@ -166,6 +190,36 @@ describe("resolveAutoMediaKeyProviders", () => {
|
||||
]);
|
||||
});
|
||||
|
||||
it("preserves configured MiniMax CN aliases for image auto discovery", () => {
|
||||
const providers = resolveAutoMediaKeyProviders({
|
||||
capability: "image",
|
||||
cfg: {
|
||||
models: {
|
||||
providers: {
|
||||
"minimax-cn": {
|
||||
models: [{ id: "MiniMax-M2.7", input: ["text", "image"] }],
|
||||
},
|
||||
"minimax-portal-cn": {
|
||||
models: [{ id: "MiniMax-M2.7", input: ["text", "image"] }],
|
||||
},
|
||||
gemini: {
|
||||
models: [{ id: "gemini-3-flash-preview", input: ["text", "image"] }],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as never,
|
||||
});
|
||||
|
||||
expect(providers).toContain("minimax-cn");
|
||||
expect(providers).toContain("minimax-portal-cn");
|
||||
expect(providers).not.toContain("gemini");
|
||||
expect(providers).toContain("google");
|
||||
expect(providers.indexOf("minimax-cn")).toBeLessThan(providers.indexOf("minimax"));
|
||||
expect(providers.indexOf("minimax-portal-cn")).toBeLessThan(
|
||||
providers.indexOf("minimax-portal"),
|
||||
);
|
||||
});
|
||||
|
||||
it("keeps the bundled video fallback order", () => {
|
||||
expect(resolveAutoMediaKeyProviders({ capability: "video" })).toEqual([
|
||||
"google",
|
||||
|
||||
@@ -2,7 +2,10 @@ import { resolveRuntimeConfigCacheKey } from "../config/runtime-snapshot.js";
|
||||
import type { OpenClawConfig } from "../config/types.js";
|
||||
import { normalizeOptionalString } from "../shared/string-coerce.js";
|
||||
import { buildMediaUnderstandingManifestMetadataRegistry } from "./manifest-metadata.js";
|
||||
import { normalizeMediaProviderId } from "./provider-registry.js";
|
||||
import {
|
||||
normalizeMediaExecutionProviderId,
|
||||
normalizeMediaProviderId,
|
||||
} from "./provider-registry.js";
|
||||
import { providerSupportsCapability } from "./provider-supports.js";
|
||||
import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
|
||||
export {
|
||||
@@ -65,11 +68,11 @@ function resolveConfiguredImageProviderModel(params: {
|
||||
cfg?: OpenClawConfig;
|
||||
providerId: string;
|
||||
}): string | undefined {
|
||||
const normalizedProviderId = normalizeMediaProviderId(params.providerId);
|
||||
const providers = params.cfg?.models?.providers;
|
||||
if (!providers || typeof providers !== "object") {
|
||||
return undefined;
|
||||
}
|
||||
const normalizedProviderId = normalizeMediaProviderId(params.providerId);
|
||||
for (const [providerKey, providerCfg] of Object.entries(providers)) {
|
||||
if (normalizeMediaProviderId(providerKey) !== normalizedProviderId) {
|
||||
continue;
|
||||
@@ -93,7 +96,7 @@ function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] {
|
||||
}
|
||||
const configured: string[] = [];
|
||||
for (const [providerKey, providerCfg] of Object.entries(providers)) {
|
||||
const normalizedProviderId = normalizeMediaProviderId(providerKey);
|
||||
const normalizedProviderId = normalizeMediaExecutionProviderId(providerKey);
|
||||
if (!normalizedProviderId || configured.includes(normalizedProviderId)) {
|
||||
continue;
|
||||
}
|
||||
@@ -108,14 +111,39 @@ function resolveConfiguredImageProviderIds(cfg?: OpenClawConfig): string[] {
|
||||
return configured;
|
||||
}
|
||||
|
||||
/**
 * Reports whether `providerId` is an alias, i.e. whether
 * `normalizeMediaProviderId` maps it to a different id than the one given.
 */
function isExecutionAliasProvider(providerId: string): boolean {
  const canonicalId = normalizeMediaProviderId(providerId);
  return canonicalId !== providerId;
}
|
||||
|
||||
/**
 * Merges configured image providers into the prioritized provider order.
 *
 * - Configured alias ids (see `isExecutionAliasProvider`) are placed directly
 *   in front of their canonical id when that id is already in the list, and at
 *   the very front otherwise.
 * - Configured non-alias ids are appended at the end, in configured order.
 * - Duplicates are dropped, keeping the first occurrence.
 *
 * @param params.prioritized Base provider order; not mutated.
 * @param params.configured Provider ids discovered from configuration.
 * @returns A new, de-duplicated provider order.
 */
function insertConfiguredImageProviders(params: {
  prioritized: string[];
  configured: string[];
}): string[] {
  const ordered = params.prioritized.slice();
  const nonAliasIds: string[] = [];

  for (const id of params.configured) {
    if (!isExecutionAliasProvider(id)) {
      // Deferred so that every alias is positioned before any append happens.
      nonAliasIds.push(id);
      continue;
    }
    const anchor = ordered.indexOf(normalizeMediaProviderId(id));
    if (anchor < 0) {
      ordered.unshift(id);
    } else {
      ordered.splice(anchor, 0, id);
    }
  }

  ordered.push(...nonAliasIds);
  return Array.from(new Set(ordered));
}
|
||||
|
||||
export function resolveDefaultMediaModel(params: {
|
||||
providerId: string;
|
||||
capability: MediaUnderstandingCapability;
|
||||
cfg?: OpenClawConfig;
|
||||
workspaceDir?: string;
|
||||
providerRegistry?: Map<string, MediaUnderstandingProvider>;
|
||||
includeConfiguredImageModels?: boolean;
|
||||
}): string | undefined {
|
||||
if (!params.providerRegistry) {
|
||||
if (!params.providerRegistry && params.includeConfiguredImageModels !== false) {
|
||||
const configuredImageModel =
|
||||
params.capability === "image"
|
||||
? resolveConfiguredImageProviderModel({
|
||||
@@ -130,7 +158,13 @@ export function resolveDefaultMediaModel(params: {
|
||||
const registry =
|
||||
params.providerRegistry ?? resolveDefaultRegistry(params.cfg, params.workspaceDir);
|
||||
const provider = registry.get(normalizeMediaProviderId(params.providerId));
|
||||
return normalizeOptionalString(provider?.defaultModels?.[params.capability]);
|
||||
const manifestDefaultModel = normalizeOptionalString(
|
||||
provider?.defaultModels?.[params.capability],
|
||||
);
|
||||
if (manifestDefaultModel) {
|
||||
return manifestDefaultModel;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
export function resolveAutoMediaKeyProviders(params: {
|
||||
@@ -165,7 +199,10 @@ export function resolveAutoMediaKeyProviders(params: {
|
||||
if (params.providerRegistry || params.capability !== "image") {
|
||||
return prioritized;
|
||||
}
|
||||
return [...new Set([...prioritized, ...resolveConfiguredImageProviderIds(params.cfg)])];
|
||||
return insertConfiguredImageProviders({
|
||||
prioritized,
|
||||
configured: resolveConfiguredImageProviderIds(params.cfg),
|
||||
});
|
||||
}
|
||||
|
||||
export function providerSupportsNativePdfDocument(params: {
|
||||
|
||||
@@ -335,6 +335,135 @@ describe("describeImageWithModel", () => {
|
||||
expect(fetchMock).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
it("uses canonical MiniMax CN baseUrl for VLM alias fallback", async () => {
|
||||
const authStorage = {
|
||||
setRuntimeApiKey: setRuntimeApiKeyMock,
|
||||
};
|
||||
resolveModelAsyncMock.mockResolvedValue({
|
||||
authStorage,
|
||||
modelRegistry: { find: vi.fn(() => null) },
|
||||
error: "Unknown model: minimax-cn/MiniMax-VL-01",
|
||||
});
|
||||
|
||||
await expect(
|
||||
describeImageWithModel({
|
||||
cfg: {
|
||||
models: {
|
||||
providers: {
|
||||
minimax: {
|
||||
apiKey: "minimax-test-key",
|
||||
baseUrl: "https://api.minimaxi.com/anthropic",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
agentDir: "/tmp/openclaw-agent",
|
||||
provider: "minimax-cn",
|
||||
model: "MiniMax-VL-01",
|
||||
buffer: Buffer.from("png-bytes"),
|
||||
fileName: "image.png",
|
||||
mime: "image/png",
|
||||
prompt: "Describe the image.",
|
||||
timeoutMs: 1000,
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
text: "portal ok",
|
||||
model: "MiniMax-VL-01",
|
||||
});
|
||||
|
||||
expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
provider: "minimax",
|
||||
}),
|
||||
);
|
||||
const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
|
||||
expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
|
||||
});
|
||||
|
||||
it("uses MiniMax CN alias auth when the alias apiKey is a SecretRef", async () => {
|
||||
const authStorage = {
|
||||
setRuntimeApiKey: setRuntimeApiKeyMock,
|
||||
};
|
||||
resolveModelAsyncMock.mockResolvedValue({
|
||||
authStorage,
|
||||
modelRegistry: { find: vi.fn(() => null) },
|
||||
error: "Unknown model: minimax-cn/MiniMax-VL-01",
|
||||
});
|
||||
|
||||
await expect(
|
||||
describeImageWithModel({
|
||||
cfg: {
|
||||
models: {
|
||||
providers: {
|
||||
"minimax-cn": {
|
||||
apiKey: { source: "file", provider: "default", id: "/providers/minimax-cn/apiKey" },
|
||||
baseUrl: "https://api.minimaxi.com/anthropic",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
agentDir: "/tmp/openclaw-agent",
|
||||
provider: "minimax-cn",
|
||||
model: "MiniMax-VL-01",
|
||||
buffer: Buffer.from("png-bytes"),
|
||||
fileName: "image.png",
|
||||
mime: "image/png",
|
||||
prompt: "Describe the image.",
|
||||
timeoutMs: 1000,
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
text: "portal ok",
|
||||
model: "MiniMax-VL-01",
|
||||
});
|
||||
|
||||
expect(resolveApiKeyForProviderMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
provider: "minimax-cn",
|
||||
}),
|
||||
);
|
||||
const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
|
||||
expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
|
||||
});
|
||||
|
||||
it("does not inherit global MiniMax baseUrl for CN VLM aliases", async () => {
|
||||
const authStorage = {
|
||||
setRuntimeApiKey: setRuntimeApiKeyMock,
|
||||
};
|
||||
resolveModelAsyncMock.mockResolvedValue({
|
||||
authStorage,
|
||||
modelRegistry: { find: vi.fn(() => null) },
|
||||
error: "Unknown model: minimax-cn/MiniMax-VL-01",
|
||||
});
|
||||
|
||||
await expect(
|
||||
describeImageWithModel({
|
||||
cfg: {
|
||||
models: {
|
||||
providers: {
|
||||
minimax: { baseUrl: "https://api.minimax.io/anthropic", models: [] },
|
||||
},
|
||||
},
|
||||
},
|
||||
agentDir: "/tmp/openclaw-agent",
|
||||
provider: "minimax-cn",
|
||||
model: "MiniMax-VL-01",
|
||||
buffer: Buffer.from("png-bytes"),
|
||||
fileName: "image.png",
|
||||
mime: "image/png",
|
||||
prompt: "Describe the image.",
|
||||
timeoutMs: 1000,
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
text: "portal ok",
|
||||
model: "MiniMax-VL-01",
|
||||
});
|
||||
|
||||
const [fetchUrl] = requireFirstMockCall(fetchMock, "fetch");
|
||||
expect(fetchUrl).toBe("https://api.minimaxi.com/v1/coding_plan/vlm");
|
||||
});
|
||||
|
||||
it("carries workspaceDir through image model and stream resolution", async () => {
|
||||
discoverModelsMock.mockReturnValue({
|
||||
find: vi.fn(() => ({
|
||||
|
||||
@@ -21,11 +21,13 @@ import {
|
||||
coerceImageAssistantText,
|
||||
hasImageReasoningOnlyResponse,
|
||||
} from "../agents/tools/image-tool.helpers.js";
|
||||
import { isSecretRef } from "../config/types.secrets.js";
|
||||
import {
|
||||
buildCopilotIdeHeaders,
|
||||
COPILOT_INTEGRATION_ID,
|
||||
resolveCopilotApiToken,
|
||||
} from "../plugin-sdk/provider-auth.js";
|
||||
import { normalizeMediaProviderId } from "./provider-id.js";
|
||||
import type {
|
||||
ImageDescriptionRequest,
|
||||
ImageDescriptionResult,
|
||||
@@ -315,6 +317,7 @@ function buildImageRequestHeaders(model: Model<Api>): Record<string, string> | u
|
||||
|
||||
async function describeImagesWithMinimax(params: {
|
||||
apiKey: string;
|
||||
provider: string;
|
||||
modelId: string;
|
||||
modelBaseUrl?: string;
|
||||
prompt: string;
|
||||
@@ -329,6 +332,7 @@ async function describeImagesWithMinimax(params: {
|
||||
: params.prompt;
|
||||
const text = await minimaxUnderstandImage({
|
||||
apiKey: params.apiKey,
|
||||
provider: params.provider,
|
||||
prompt,
|
||||
imageDataUrl: `data:${image.mime ?? "image/jpeg"};base64,${image.buffer.toString("base64")}`,
|
||||
modelBaseUrl: params.modelBaseUrl,
|
||||
@@ -354,9 +358,53 @@ function resolveConfiguredProviderBaseUrl(
|
||||
if (typeof direct?.baseUrl === "string" && direct.baseUrl.trim()) {
|
||||
return direct.baseUrl.trim();
|
||||
}
|
||||
const normalizedProvider = normalizeMediaProviderId(provider);
|
||||
const normalized = cfg.models?.providers?.[normalizedProvider];
|
||||
if (typeof normalized?.baseUrl === "string" && normalized.baseUrl.trim()) {
|
||||
if (isMinimaxCnAlias(provider) && !isMinimaxCnBaseUrl(normalized.baseUrl)) {
|
||||
return undefined;
|
||||
}
|
||||
return normalized.baseUrl.trim();
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
 * Reports whether `provider` is one of the MiniMax CN alias ids
 * (`minimax-cn` or `minimax-portal-cn`), ignoring case and surrounding
 * whitespace.
 */
function isMinimaxCnAlias(provider: string): boolean {
  switch (provider.trim().toLowerCase()) {
    case "minimax-cn":
    case "minimax-portal-cn":
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Reports whether `baseUrl` points at the host `api.minimaxi.com`.
 *
 * The value is trimmed first; if it has no `http://` or `https://` prefix,
 * `https://` is prepended before parsing. Blank or unparseable values yield
 * `false`.
 *
 * @param baseUrl Configured provider base URL, with or without a scheme.
 * @returns `true` only when the parsed hostname equals `api.minimaxi.com`.
 */
function isMinimaxCnBaseUrl(baseUrl: string): boolean {
  const candidate = baseUrl.trim();
  if (candidate.length === 0) {
    return false;
  }
  const withScheme = /^https?:\/\//i.test(candidate) ? candidate : `https://${candidate}`;
  let hostname: string;
  try {
    hostname = new URL(withScheme).hostname;
  } catch {
    // Values that cannot be parsed as a URL are treated as non-CN.
    return false;
  }
  return hostname.toLowerCase() === "api.minimaxi.com";
}
|
||||
|
||||
/**
 * Reports whether the config entry stored under exactly `provider` declares an
 * API key, either as a non-blank string or as a value accepted by
 * `isSecretRef`.
 *
 * @param cfg Request config whose `models.providers` map is inspected.
 * @param provider Provider key looked up verbatim (no normalization).
 */
function hasConfiguredProviderApiKey(
  cfg: ImageDescriptionRequest["cfg"],
  provider: string,
): boolean {
  const configuredKey = cfg.models?.providers?.[provider]?.apiKey;
  if (typeof configuredKey === "string" && configuredKey.trim().length > 0) {
    return true;
  }
  return isSecretRef(configuredKey);
}
|
||||
|
||||
/**
 * Chooses the provider id used for the MiniMax VLM auth lookup.
 *
 * A MiniMax CN alias that has no API key configured under its own id is
 * resolved through `normalizeMediaProviderId`; every other provider id is
 * returned unchanged.
 *
 * @param cfg Request config consulted for an alias-specific API key.
 * @param provider Provider id requested by the caller.
 * @returns The provider id to pass to API-key resolution.
 */
function resolveMinimaxVlmAuthProvider(
  cfg: ImageDescriptionRequest["cfg"],
  provider: string,
): string {
  const useNormalizedProvider =
    isMinimaxCnAlias(provider) && !hasConfiguredProviderApiKey(cfg, provider);
  return useNormalizedProvider ? normalizeMediaProviderId(provider) : provider;
}
|
||||
|
||||
async function resolveMinimaxVlmFallbackRuntime(params: {
|
||||
cfg: ImageDescriptionRequest["cfg"];
|
||||
agentDir: string;
|
||||
@@ -365,8 +413,9 @@ async function resolveMinimaxVlmFallbackRuntime(params: {
|
||||
profile?: string;
|
||||
preferredProfile?: string;
|
||||
}): Promise<{ apiKey: string; modelBaseUrl?: string }> {
|
||||
const authProvider = resolveMinimaxVlmAuthProvider(params.cfg, params.provider);
|
||||
const auth = await resolveApiKeyForProvider({
|
||||
provider: params.provider,
|
||||
provider: authProvider,
|
||||
cfg: params.cfg,
|
||||
profileId: params.profile,
|
||||
preferredProfile: params.preferredProfile,
|
||||
@@ -374,7 +423,7 @@ async function resolveMinimaxVlmFallbackRuntime(params: {
|
||||
...(params.workspaceDir ? { workspaceDir: params.workspaceDir } : {}),
|
||||
});
|
||||
return {
|
||||
apiKey: requireApiKey(auth, params.provider),
|
||||
apiKey: requireApiKey(auth, authProvider),
|
||||
modelBaseUrl: resolveConfiguredProviderBaseUrl(params.cfg, params.provider),
|
||||
};
|
||||
}
|
||||
@@ -437,6 +486,7 @@ async function describeImagesWithModelInternal(
|
||||
const fallback = await resolveMinimaxVlmFallbackRuntime(params);
|
||||
return await describeImagesWithMinimax({
|
||||
apiKey: fallback.apiKey,
|
||||
provider: params.provider,
|
||||
modelId: params.model,
|
||||
modelBaseUrl: fallback.modelBaseUrl,
|
||||
prompt,
|
||||
@@ -448,6 +498,7 @@ async function describeImagesWithModelInternal(
|
||||
if (isMinimaxVlmModel(model.provider, model.id)) {
|
||||
return await describeImagesWithMinimax({
|
||||
apiKey,
|
||||
provider: model.provider,
|
||||
modelId: model.id,
|
||||
modelBaseUrl: model.baseUrl,
|
||||
prompt,
|
||||
|
||||
@@ -107,6 +107,28 @@ describe("media understanding attachments SSRF", () => {
|
||||
expect(fetchSpy).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("uses fetched content type instead of wildcard selection hints", async () => {
|
||||
const url = "http://198.18.0.153/image";
|
||||
const fetchSpy = vi.fn().mockResolvedValue(
|
||||
new Response("image", {
|
||||
headers: { "content-type": "image/png" },
|
||||
}),
|
||||
);
|
||||
globalThis.fetch = withFetchPreconnect(fetchSpy);
|
||||
const cache = new MediaAttachmentCache([{ index: 0, url, mime: "image/*" }], {
|
||||
ssrfPolicy: { allowRfc2544BenchmarkRange: true },
|
||||
});
|
||||
|
||||
const result = await cache.getBuffer({
|
||||
attachmentIndex: 0,
|
||||
maxBytes: 1024,
|
||||
timeoutMs: 1000,
|
||||
});
|
||||
|
||||
expect(result.mime).toBe("image/png");
|
||||
expect(result.fileName).toBe("image.png");
|
||||
});
|
||||
|
||||
it("reads local attachments inside configured roots", async () => {
|
||||
await withLocalAttachmentCache("openclaw-media-cache-allowed-", async ({ cache }) => {
|
||||
const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 });
|
||||
|
||||
@@ -5,5 +5,19 @@ export function normalizeMediaProviderId(id: string): string {
|
||||
if (normalized === "gemini") {
|
||||
return "google";
|
||||
}
|
||||
if (normalized === "minimax-cn") {
|
||||
return "minimax";
|
||||
}
|
||||
if (normalized === "minimax-portal-cn") {
|
||||
return "minimax-portal";
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
 * Normalizes a provider id for execution while keeping the MiniMax CN aliases
 * distinct.
 *
 * After `normalizeProviderId`, the ids `minimax-cn` and `minimax-portal-cn`
 * are returned as-is; every other id is passed on to
 * `normalizeMediaProviderId`.
 *
 * @param id Raw provider id.
 * @returns The execution provider id.
 */
export function normalizeMediaExecutionProviderId(id: string): string {
  const providerId = normalizeProviderId(id);
  switch (providerId) {
    case "minimax-cn":
    case "minimax-portal-cn":
      return providerId;
    default:
      return normalizeMediaProviderId(providerId);
  }
}
|
||||
|
||||
@@ -41,7 +41,7 @@ function hydrateModelBackedMediaProvider(
|
||||
};
|
||||
}
|
||||
|
||||
export { normalizeMediaProviderId } from "./provider-id.js";
|
||||
export { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.js";
|
||||
|
||||
export function buildMediaUnderstandingRegistry(
|
||||
overrides?: Record<string, MediaUnderstandingProvider>,
|
||||
|
||||
@@ -34,6 +34,7 @@ import { MediaUnderstandingSkipError } from "./errors.js";
|
||||
import { fileExists } from "./fs.js";
|
||||
import { describeImageWithModel } from "./image-runtime.js";
|
||||
import { extractGeminiResponse } from "./output-extract.js";
|
||||
import { normalizeMediaExecutionProviderId } from "./provider-id.js";
|
||||
import { getMediaUnderstandingProvider, normalizeMediaProviderId } from "./provider-registry.js";
|
||||
import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js";
|
||||
import type {
|
||||
@@ -566,6 +567,7 @@ export async function runProviderEntry(params: {
|
||||
throw new Error(`Provider entry missing provider for ${capability}`);
|
||||
}
|
||||
const providerId = normalizeMediaProviderId(providerIdRaw);
|
||||
const requestProviderId = normalizeMediaExecutionProviderId(providerIdRaw);
|
||||
const { maxBytes, maxChars, timeoutMs, prompt } = resolveEntryRunOptions({
|
||||
capability,
|
||||
entry,
|
||||
@@ -587,13 +589,13 @@ export async function runProviderEntry(params: {
|
||||
timeoutMs,
|
||||
});
|
||||
const requestOverrides = resolveMediaRequestOverrides(params.config);
|
||||
const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
|
||||
const provider = getMediaUnderstandingProvider(requestProviderId, params.providerRegistry);
|
||||
const imageInput = {
|
||||
buffer: media.buffer,
|
||||
fileName: media.fileName,
|
||||
mime: media.mime,
|
||||
model: modelId,
|
||||
provider: providerId,
|
||||
provider: requestProviderId,
|
||||
prompt: requestOverrides.prompt ?? prompt,
|
||||
timeoutMs,
|
||||
profile: entry.profile,
|
||||
@@ -608,7 +610,7 @@ export async function runProviderEntry(params: {
|
||||
kind: "image.description",
|
||||
attachmentIndex: params.attachmentIndex,
|
||||
text: trimOutput(result.text, maxChars),
|
||||
provider: providerId,
|
||||
provider: requestProviderId,
|
||||
model: result.model ?? modelId,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ import { constants as fsConstants } from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { isMinimaxVlmModel, isMinimaxVlmProvider } from "../agents/minimax-vlm.js";
|
||||
import { findNormalizedProviderValue } from "../agents/provider-id.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import {
|
||||
@@ -26,7 +27,7 @@ import { MediaAttachmentCache, selectAttachments } from "./attachments.js";
|
||||
import { isMediaUnderstandingSkipError } from "./errors.js";
|
||||
import { fileExists } from "./fs.js";
|
||||
import { extractGeminiResponse } from "./output-extract.js";
|
||||
import { normalizeMediaProviderId } from "./provider-id.js";
|
||||
import { normalizeMediaExecutionProviderId, normalizeMediaProviderId } from "./provider-id.js";
|
||||
import {
|
||||
buildMediaUnderstandingRegistry,
|
||||
getMediaUnderstandingProvider,
|
||||
@@ -73,7 +74,7 @@ function resolveLiteralProviderApiKey(
|
||||
cfg: OpenClawConfig | undefined,
|
||||
providerId: string,
|
||||
): string | null {
|
||||
const value = cfg?.models?.providers?.[providerId]?.apiKey;
|
||||
const value = findNormalizedProviderValue(cfg?.models?.providers, providerId)?.apiKey;
|
||||
return typeof value === "string" && value.trim().length > 0 ? value.trim() : null;
|
||||
}
|
||||
|
||||
@@ -98,11 +99,14 @@ function resolveConfiguredKeyProviderOrder(params: {
|
||||
fallbackProviders: readonly string[];
|
||||
}): string[] {
|
||||
const configuredProviders = Object.keys(params.cfg.models?.providers ?? {})
|
||||
.map((providerId) => normalizeMediaProviderId(providerId))
|
||||
.map((providerId) => normalizeMediaExecutionProviderId(providerId))
|
||||
.filter(Boolean)
|
||||
.filter((providerId, index, values) => values.indexOf(providerId) === index)
|
||||
.filter((providerId) =>
|
||||
providerSupportsCapability(params.providerRegistry.get(providerId), params.capability),
|
||||
providerSupportsCapability(
|
||||
params.providerRegistry.get(normalizeMediaProviderId(providerId)),
|
||||
params.capability,
|
||||
),
|
||||
);
|
||||
|
||||
return [...new Set([...configuredProviders, ...params.fallbackProviders])];
|
||||
@@ -112,6 +116,9 @@ function resolveConfiguredImageModelId(params: {
|
||||
cfg: OpenClawConfig;
|
||||
providerId: string;
|
||||
}): string | undefined {
|
||||
if (isMinimaxVlmProvider(params.providerId)) {
|
||||
return undefined;
|
||||
}
|
||||
const configured = resolveConfiguredImageModel(params);
|
||||
const id = configured?.id?.trim();
|
||||
return id || undefined;
|
||||
@@ -145,7 +152,7 @@ function resolveCatalogImageModelId(params: {
|
||||
}): string | undefined {
|
||||
const matches = params.catalog.filter(
|
||||
(entry) =>
|
||||
normalizeMediaProviderId(entry.provider) === params.providerId &&
|
||||
normalizeMediaProviderId(entry.provider) === normalizeMediaProviderId(params.providerId) &&
|
||||
params.modelSupportsVision(entry),
|
||||
);
|
||||
if (matches.length === 0) {
|
||||
@@ -200,6 +207,12 @@ async function explicitImageModelVisionStatus(params: {
|
||||
providerId: string;
|
||||
model: string;
|
||||
}): Promise<"supported" | "unsupported" | "unknown"> {
|
||||
if (
|
||||
isMinimaxVlmProvider(params.providerId) &&
|
||||
!isMinimaxVlmModel(params.providerId, params.model)
|
||||
) {
|
||||
return "unsupported";
|
||||
}
|
||||
const configured = resolveConfiguredImageModel(params);
|
||||
if (configured?.id?.trim() === params.model && configured.input?.includes("image")) {
|
||||
return "supported";
|
||||
@@ -231,6 +244,9 @@ async function resolveAutoImageModelId(params: {
|
||||
return explicit;
|
||||
}
|
||||
}
|
||||
if (isMinimaxVlmProvider(params.providerId)) {
|
||||
return "MiniMax-VL-01";
|
||||
}
|
||||
const configuredModel = resolveConfiguredImageModelId(params);
|
||||
if (configuredModel) {
|
||||
return configuredModel;
|
||||
@@ -736,7 +752,7 @@ async function resolveActiveModelEntry(params: {
|
||||
if (!activeProviderRaw) {
|
||||
return null;
|
||||
}
|
||||
const providerId = normalizeMediaProviderId(activeProviderRaw);
|
||||
const providerId = normalizeMediaExecutionProviderId(activeProviderRaw);
|
||||
if (!providerId) {
|
||||
return null;
|
||||
}
|
||||
@@ -940,6 +956,7 @@ export async function runCapability(params: {
|
||||
if (
|
||||
capability === "image" &&
|
||||
activeProvider &&
|
||||
!isMinimaxVlmProvider(activeProvider) &&
|
||||
!hasExplicitImageUnderstandingConfig({ cfg, config })
|
||||
) {
|
||||
const { findModelInCatalog, loadModelCatalog, modelSupportsVision } =
|
||||
|
||||
@@ -12,6 +12,7 @@ import { createEmptyPluginRegistry } from "../plugins/registry.js";
|
||||
import { setActivePluginRegistry } from "../plugins/runtime.js";
|
||||
import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js";
|
||||
import { withMediaFixture } from "./runner.test-utils.js";
|
||||
import type { MediaUnderstandingProvider } from "./types.js";
|
||||
|
||||
type TestCatalogEntry = {
|
||||
id: string;
|
||||
@@ -273,7 +274,7 @@ describe("runCapability image skip", () => {
|
||||
imageModel: { primary: "openrouter/google/gemini-2.5-flash" },
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig;
|
||||
} as unknown as OpenClawConfig;
|
||||
|
||||
await expect(
|
||||
resolveAutoImageModel({
|
||||
@@ -286,13 +287,13 @@ describe("runCapability image skip", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("falls back from an active text model to the provider image default", async () => {
|
||||
it("falls back from a MiniMax chat model to the provider image default", async () => {
|
||||
catalog = [
|
||||
{
|
||||
id: "MiniMax-M2.7",
|
||||
name: "MiniMax M2.7",
|
||||
provider: "minimax-portal",
|
||||
input: ["text"] as const,
|
||||
input: ["text", "image"] as const,
|
||||
},
|
||||
{
|
||||
id: "MiniMax-VL-01",
|
||||
@@ -302,7 +303,20 @@ describe("runCapability image skip", () => {
|
||||
},
|
||||
];
|
||||
vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key");
|
||||
const cfg = {} as OpenClawConfig;
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
"minimax-portal": {
|
||||
models: [
|
||||
{
|
||||
id: "MiniMax-M2.7",
|
||||
input: ["text", "image"],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as unknown as OpenClawConfig;
|
||||
const pluginRegistry = createEmptyPluginRegistry();
|
||||
pluginRegistry.mediaUnderstandingProviders.push({
|
||||
pluginId: "minimax",
|
||||
@@ -333,6 +347,300 @@ describe("runCapability image skip", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("does not native-skip MiniMax chat models that claim image input", async () => {
|
||||
catalog = [
|
||||
{
|
||||
id: "MiniMax-M2.7",
|
||||
name: "MiniMax M2.7",
|
||||
provider: "minimax-portal",
|
||||
input: ["text", "image"] as const,
|
||||
},
|
||||
];
|
||||
vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key");
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
"minimax-portal": {
|
||||
models: [
|
||||
{
|
||||
id: "MiniMax-M2.7",
|
||||
input: ["text", "image"],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as unknown as OpenClawConfig;
|
||||
const pluginRegistry = createEmptyPluginRegistry();
|
||||
pluginRegistry.mediaUnderstandingProviders.push({
|
||||
pluginId: "minimax",
|
||||
pluginName: "MiniMax Provider",
|
||||
source: "test",
|
||||
provider: {
|
||||
id: "minimax-portal",
|
||||
capabilities: ["image"],
|
||||
defaultModels: { image: "MiniMax-VL-01" },
|
||||
describeImage: async (req) => ({ text: "vlm ok", model: req.model }),
|
||||
},
|
||||
});
|
||||
setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
|
||||
|
||||
try {
|
||||
await withMediaFixture(
|
||||
{
|
||||
filePrefix: "openclaw-minimax-vlm-no-native-skip",
|
||||
extension: "png",
|
||||
mediaType: "image/png",
|
||||
fileContents: Buffer.from("image"),
|
||||
},
|
||||
async ({ ctx, media, cache }) => {
|
||||
const result = await runCapability({
|
||||
capability: "image",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
agentDir: "/tmp",
|
||||
providerRegistry: buildProviderRegistry(undefined, cfg),
|
||||
activeModel: { provider: "minimax-portal", model: "MiniMax-M2.7" },
|
||||
});
|
||||
|
||||
expect(result.decision.outcome).toBe("success");
|
||||
expect(requireCapabilityOutput(result, 0)).toEqual({
|
||||
kind: "image.description",
|
||||
attachmentIndex: 0,
|
||||
provider: "minimax-portal",
|
||||
model: "MiniMax-VL-01",
|
||||
text: "vlm ok",
|
||||
});
|
||||
},
|
||||
);
|
||||
} finally {
|
||||
setActivePluginRegistry(createEmptyPluginRegistry());
|
||||
vi.unstubAllEnvs();
|
||||
}
|
||||
});
|
||||
|
||||
it("preserves MiniMax CN aliases from configured provider routing", async () => {
|
||||
const seenProviders: string[] = [];
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
"minimax-cn": {
|
||||
apiKey: "test-minimax-key",
|
||||
baseUrl: "https://api.minimaxi.com/anthropic",
|
||||
models: [],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig;
|
||||
const pluginRegistry = createEmptyPluginRegistry();
|
||||
pluginRegistry.mediaUnderstandingProviders.push({
|
||||
pluginId: "minimax",
|
||||
pluginName: "MiniMax Provider",
|
||||
source: "test",
|
||||
provider: {
|
||||
id: "minimax",
|
||||
capabilities: ["image"],
|
||||
defaultModels: { image: "MiniMax-VL-01" },
|
||||
describeImage: async (req) => {
|
||||
seenProviders.push(req.provider);
|
||||
return { text: "cn vlm ok", model: req.model };
|
||||
},
|
||||
},
|
||||
});
|
||||
setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
|
||||
|
||||
try {
|
||||
await withMediaFixture(
|
||||
{
|
||||
filePrefix: "openclaw-minimax-cn-provider",
|
||||
extension: "png",
|
||||
mediaType: "image/png",
|
||||
fileContents: Buffer.from("image"),
|
||||
},
|
||||
async ({ ctx, media, cache }) => {
|
||||
const result = await runCapability({
|
||||
capability: "image",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
agentDir: "/tmp",
|
||||
providerRegistry: buildProviderRegistry(undefined, cfg),
|
||||
});
|
||||
|
||||
expect(result.decision.outcome).toBe("success");
|
||||
expect(seenProviders).toEqual(["minimax-cn"]);
|
||||
expect(requireCapabilityOutput(result, 0)).toEqual({
|
||||
kind: "image.description",
|
||||
attachmentIndex: 0,
|
||||
provider: "minimax-cn",
|
||||
model: "MiniMax-VL-01",
|
||||
text: "cn vlm ok",
|
||||
});
|
||||
},
|
||||
);
|
||||
} finally {
|
||||
setActivePluginRegistry(createEmptyPluginRegistry());
|
||||
vi.unstubAllEnvs();
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps MiniMax auto routing on VLM when registry lacks a default model", async () => {
|
||||
let seenModel: string | undefined;
|
||||
await withMediaFixture(
|
||||
{
|
||||
filePrefix: "openclaw-minimax-vlm-default",
|
||||
extension: "png",
|
||||
mediaType: "image/png",
|
||||
fileContents: Buffer.from("image"),
|
||||
},
|
||||
async ({ ctx, media, cache }) => {
|
||||
const cfg = {
|
||||
models: {
|
||||
providers: {
|
||||
minimax: {
|
||||
apiKey: "test-minimax-key",
|
||||
baseUrl: "https://api.minimax.io/anthropic",
|
||||
models: [
|
||||
{
|
||||
id: "MiniMax-M2.5",
|
||||
name: "MiniMax M2.5",
|
||||
reasoning: false,
|
||||
input: ["text", "image"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128_000,
|
||||
maxTokens: 8_192,
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig;
|
||||
|
||||
const result = await runCapability({
|
||||
capability: "image",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
agentDir: "/tmp",
|
||||
providerRegistry: new Map([
|
||||
[
|
||||
"minimax",
|
||||
{
|
||||
id: "minimax",
|
||||
capabilities: ["image"],
|
||||
describeImage: async (req) => {
|
||||
seenModel = req.model;
|
||||
return { text: "vlm ok", model: req.model };
|
||||
},
|
||||
},
|
||||
],
|
||||
]),
|
||||
});
|
||||
|
||||
expect(result.decision.outcome).toBe("success");
|
||||
expect(seenModel).toBe("MiniMax-VL-01");
|
||||
expect(requireCapabilityOutput(result, 0)).toMatchObject({
|
||||
provider: "minimax",
|
||||
model: "MiniMax-VL-01",
|
||||
text: "vlm ok",
|
||||
});
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("keeps non-MiniMax media aliases canonical for image execution", async () => {
|
||||
const seenProviders: string[] = [];
|
||||
const cfg = {
|
||||
tools: {
|
||||
media: {
|
||||
image: {
|
||||
models: [{ provider: "gemini", model: "gemini-3-flash-preview" }],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig;
|
||||
const providerRegistry = new Map<string, MediaUnderstandingProvider>([
|
||||
[
|
||||
"google",
|
||||
{
|
||||
id: "google",
|
||||
capabilities: ["image" as const],
|
||||
describeImage: async (req) => {
|
||||
seenProviders.push(req.provider);
|
||||
return { text: "google ok", model: req.model };
|
||||
},
|
||||
},
|
||||
],
|
||||
]);
|
||||
|
||||
await withMediaFixture(
|
||||
{
|
||||
filePrefix: "openclaw-gemini-media-alias",
|
||||
extension: "png",
|
||||
mediaType: "image/png",
|
||||
fileContents: Buffer.from("image"),
|
||||
},
|
||||
async ({ ctx, media, cache }) => {
|
||||
const result = await runCapability({
|
||||
capability: "image",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
agentDir: "/tmp",
|
||||
providerRegistry,
|
||||
});
|
||||
|
||||
expect(result.decision.outcome).toBe("success");
|
||||
expect(seenProviders).toEqual(["google"]);
|
||||
expect(requireCapabilityOutput(result, 0)).toEqual({
|
||||
kind: "image.description",
|
||||
attachmentIndex: 0,
|
||||
provider: "google",
|
||||
model: "gemini-3-flash-preview",
|
||||
text: "google ok",
|
||||
});
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("canonicalizes non-MiniMax active media aliases for auto image resolution", async () => {
|
||||
vi.stubEnv("GEMINI_API_KEY", "test-gemini-key");
|
||||
const cfg = {} as OpenClawConfig;
|
||||
const pluginRegistry = createEmptyPluginRegistry();
|
||||
pluginRegistry.mediaUnderstandingProviders.push({
|
||||
pluginId: "google",
|
||||
pluginName: "Google Provider",
|
||||
source: "test",
|
||||
provider: {
|
||||
id: "google",
|
||||
capabilities: ["image"],
|
||||
defaultModels: { image: "gemini-3-flash-preview" },
|
||||
describeImage: async () => ({ text: "ok" }),
|
||||
},
|
||||
});
|
||||
setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
|
||||
|
||||
try {
|
||||
await expect(
|
||||
resolveAutoImageModel({
|
||||
cfg,
|
||||
activeModel: { provider: "gemini", model: "gemini-3-flash-preview" },
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
provider: "google",
|
||||
model: "gemini-3-flash-preview",
|
||||
});
|
||||
} finally {
|
||||
setActivePluginRegistry(createEmptyPluginRegistry());
|
||||
vi.unstubAllEnvs();
|
||||
}
|
||||
});
|
||||
|
||||
it("uses active OpenRouter image models for auto image resolution", async () => {
|
||||
vi.stubEnv("OPENROUTER_API_KEY", "test-openrouter-key");
|
||||
const cfg = {} as OpenClawConfig;
|
||||
|
||||
@@ -67,6 +67,10 @@ describe("media-understanding runtime", () => {
|
||||
afterEach(() => {
|
||||
mocks.buildProviderRegistry.mockReset();
|
||||
mocks.createMediaAttachmentCache.mockReset();
|
||||
mocks.createMediaAttachmentCache.mockReturnValue({
|
||||
cleanup: mocks.cleanup,
|
||||
getBuffer: mocks.getBuffer,
|
||||
});
|
||||
mocks.normalizeMediaAttachments.mockReset();
|
||||
mocks.normalizeMediaProviderId.mockReset();
|
||||
mocks.buildMediaUnderstandingRegistry.mockReset();
|
||||
@@ -186,6 +190,76 @@ describe("media-understanding runtime", () => {
|
||||
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("classifies extensionless remote image URLs before capability filtering", async () => {
|
||||
const output: MediaUnderstandingOutput = {
|
||||
kind: "image.description",
|
||||
attachmentIndex: 0,
|
||||
provider: "vision-plugin",
|
||||
model: "vision-v1",
|
||||
text: "image ok",
|
||||
};
|
||||
mocks.normalizeMediaAttachments.mockReturnValue([
|
||||
{ index: 0, url: "https://httpbin.org/image/png", mime: "image/*" },
|
||||
]);
|
||||
mocks.runCapability.mockResolvedValue({
|
||||
outputs: [output],
|
||||
});
|
||||
|
||||
await expect(
|
||||
describeImageFile({
|
||||
filePath: "https://httpbin.org/image/png",
|
||||
cfg: {} as OpenClawConfig,
|
||||
agentDir: "/tmp/agent",
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
text: "image ok",
|
||||
provider: "vision-plugin",
|
||||
model: "vision-v1",
|
||||
output,
|
||||
});
|
||||
|
||||
expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
|
||||
MediaUrl: "https://httpbin.org/image/png",
|
||||
MediaType: "image/*",
|
||||
});
|
||||
expect(requireRunCapabilityRequest()).toMatchObject({
|
||||
ctx: {
|
||||
MediaUrl: "https://httpbin.org/image/png",
|
||||
MediaType: "image/*",
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("does not force typed remote URLs into the requested capability", async () => {
|
||||
const media = [{ index: 0, url: "https://example.com/clip.mp4", mime: "video/mp4" }];
|
||||
mocks.normalizeMediaAttachments.mockReturnValue(media);
|
||||
mocks.runCapability.mockResolvedValue({
|
||||
outputs: [],
|
||||
decision: { capability: "image", outcome: "skipped", attachments: [] },
|
||||
});
|
||||
|
||||
await expect(
|
||||
describeImageFile({
|
||||
filePath: "https://example.com/clip.mp4",
|
||||
cfg: {} as OpenClawConfig,
|
||||
agentDir: "/tmp/agent",
|
||||
}),
|
||||
).resolves.toMatchObject({
|
||||
text: undefined,
|
||||
output: undefined,
|
||||
});
|
||||
|
||||
expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
|
||||
MediaUrl: "https://example.com/clip.mp4",
|
||||
MediaType: "video/mp4",
|
||||
});
|
||||
expect(requireRunCapabilityRequest()).toMatchObject({
|
||||
capability: "image",
|
||||
ctx: { MediaUrl: "https://example.com/clip.mp4", MediaType: "video/mp4" },
|
||||
media,
|
||||
});
|
||||
});
|
||||
|
||||
it("passes workspaceDir through file media understanding requests", async () => {
|
||||
const output: MediaUnderstandingOutput = {
|
||||
kind: "image.description",
|
||||
@@ -395,6 +469,7 @@ describe("media-understanding runtime", () => {
|
||||
await describeImageFileWithModel({
|
||||
filePath: "https://example.com/photo.png",
|
||||
mediaUrl: "https://example.com/photo.png",
|
||||
mime: "image/*",
|
||||
provider: "zai",
|
||||
model: "glm-4.6v",
|
||||
prompt: "Describe it",
|
||||
@@ -412,6 +487,58 @@ describe("media-understanding runtime", () => {
|
||||
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("fetches remote explicit image descriptions through the media attachment cache", async () => {
|
||||
mocks.normalizeMediaAttachments.mockReturnValue([
|
||||
{ index: 0, url: "https://httpbin.org/image/png", mime: "image/png" },
|
||||
]);
|
||||
mocks.buildProviderRegistry.mockReturnValue(
|
||||
new Map([["zai", { id: "zai", capabilities: ["image"] }]]),
|
||||
);
|
||||
mocks.getBuffer.mockResolvedValue({
|
||||
buffer: Buffer.from("remote-png"),
|
||||
fileName: "png",
|
||||
mime: "image/png",
|
||||
size: 10,
|
||||
});
|
||||
|
||||
await expect(
|
||||
describeImageFileWithModel({
|
||||
filePath: "https://httpbin.org/image/png",
|
||||
provider: "zai",
|
||||
model: "glm-4.6v",
|
||||
prompt: "Describe it",
|
||||
cfg: {} as OpenClawConfig,
|
||||
agentDir: "/tmp/agent",
|
||||
timeoutMs: 45_000,
|
||||
}),
|
||||
).resolves.toEqual({ text: "generic image ok", model: "vision" });
|
||||
|
||||
expect(mocks.readLocalFileSafely).not.toHaveBeenCalled();
|
||||
expect(mocks.normalizeMediaAttachments).toHaveBeenCalledWith({
|
||||
MediaUrl: "https://httpbin.org/image/png",
|
||||
MediaType: "image/*",
|
||||
});
|
||||
expect(mocks.createMediaAttachmentCache).toHaveBeenCalledWith(
|
||||
[{ index: 0, url: "https://httpbin.org/image/png", mime: "image/png" }],
|
||||
{ ssrfPolicy: undefined },
|
||||
);
|
||||
expect(mocks.getBuffer).toHaveBeenCalledWith({
|
||||
attachmentIndex: 0,
|
||||
maxBytes: 10 * 1024 * 1024,
|
||||
timeoutMs: 45_000,
|
||||
});
|
||||
expect(mocks.describeImageWithModel).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
buffer: Buffer.from("remote-png"),
|
||||
fileName: "png",
|
||||
mime: "image/png",
|
||||
provider: "zai",
|
||||
model: "glm-4.6v",
|
||||
}),
|
||||
);
|
||||
expect(mocks.cleanup).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
it("routes direct image description through a provider-specific image hook", async () => {
|
||||
const describeImage = vi.fn(async () => ({
|
||||
text: "image ok",
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import path from "node:path";
|
||||
import type { OpenClawConfig } from "../config/types.js";
|
||||
import { readLocalFileSafely } from "../infra/fs-safe.js";
|
||||
import { kindFromMime, mimeTypeFromFilePath } from "../media/mime.js";
|
||||
import { DEFAULT_MAX_BYTES } from "./defaults.constants.js";
|
||||
import { describeImageWithModel } from "./image-runtime.js";
|
||||
import {
|
||||
@@ -48,13 +50,61 @@ function resolveDecisionFailureReason(
|
||||
return normalizeDecisionReason(findDecisionReason(decision, "failed"));
|
||||
}
|
||||
|
||||
/**
 * Builds the media context object for a single file-or-URL understanding request.
 *
 * An explicit `mediaUrl`, or a `filePath` that is itself an HTTP(S) URL, is
 * reported as `MediaUrl`; any other `filePath` is reported as `MediaPath`.
 *
 * `MediaType` is resolved in this order:
 *   1. the caller-supplied `mime`;
 *   2. for remote references whose extension maps to the requested capability,
 *      the capability wildcard (for example `image/*`);
 *   3. for remote references, the MIME type derived from the extension;
 *   4. for remote references with no usable extension, the capability wildcard.
 * Local paths only ever receive the caller-supplied `mime`.
 *
 * @param params.filePath - Local path, or an HTTP(S) URL passed where a path is expected.
 * @param params.mediaUrl - Explicit remote reference; takes precedence over `filePath`.
 * @param params.mime - Caller-supplied MIME type, used verbatim when present.
 * @param params.capability - Capability being requested, used for wildcard fallbacks.
 * @returns A context carrying either `MediaUrl` or `MediaPath`, plus `MediaType`.
 */
function buildFileContext(params: {
  filePath: string;
  mediaUrl?: string;
  mime?: string;
  capability?: MediaUnderstandingCapability;
}) {
  const remoteRef =
    params.mediaUrl ??
    (isRemoteMediaReference(params.filePath) ? params.filePath.trim() : undefined);
  // Extension sniffing is only applied to remote references.
  const extensionMime = remoteRef ? mimeTypeFromFilePath(remoteRef) : undefined;
  const extensionKind = kindFromMime(extensionMime);
  const mediaType =
    params.mime ??
    (remoteRef && params.capability && extensionKind === params.capability
      ? `${params.capability}/*`
      : extensionMime) ??
    (remoteRef && params.capability ? `${params.capability}/*` : undefined);
  if (remoteRef) {
    return {
      MediaUrl: remoteRef,
      MediaType: mediaType,
    };
  }
  return {
    MediaPath: params.filePath,
    MediaType: mediaType,
  };
}
|
||||
|
||||
/**
 * Reports whether `value` looks like an HTTP(S) URL rather than a local path.
 *
 * Surrounding whitespace is ignored and the scheme match is case-insensitive.
 *
 * @param value - Candidate media reference (path or URL).
 * @returns `true` when the trimmed value starts with `http://` or `https://`.
 */
function isRemoteMediaReference(value: string): boolean {
  return /^https?:\/\//i.test(value.trim());
}
|
||||
|
||||
/**
 * Returns a trimmed MIME type only when it names a concrete subtype.
 *
 * Missing, blank, and wildcard values (anything ending in `/*`) yield
 * `undefined`, so callers can fall back to another MIME source.
 *
 * @param mime - Candidate MIME type, possibly absent or a wildcard.
 * @returns The trimmed MIME type, or `undefined` when it is not concrete.
 */
function concreteMime(mime: string | undefined): string | undefined {
  const normalized = mime?.trim();
  if (!normalized || normalized.endsWith("/*")) {
    return undefined;
  }
  return normalized;
}
|
||||
|
||||
/**
 * Computes the local path roots for a file reference.
 *
 * @param filePath - Local path, or an HTTP(S) URL passed where a path is expected.
 * @returns `undefined` for remote references; otherwise a single-element list
 *   holding the directory that contains `filePath`.
 */
function resolveFileLocalRoots(filePath: string): string[] | undefined {
  return isRemoteMediaReference(filePath) ? undefined : [path.dirname(filePath)];
}
|
||||
|
||||
/**
 * Derives a display file name from a local path or an HTTP(S) URL.
 *
 * For remote references the name comes from the URL pathname only, falling
 * back to `"image"` when the pathname has no final segment.
 *
 * @param value - Local path or remote media reference.
 * @returns The final path segment of the reference.
 */
function basenameFromMediaReference(value: string): string {
  if (isRemoteMediaReference(value)) {
    try {
      const url = new URL(value);
      return path.basename(url.pathname) || "image";
    } catch {
      // Intentionally ignored: a URL-looking value that fails to parse falls
      // through to the plain path handling below.
    }
  }
  return path.basename(value);
}
|
||||
|
||||
/**
 * Reports whether a structured-extraction input list contains an image entry.
 *
 * @param input - Input entries of a structured extraction request.
 * @returns `true` when at least one entry has `type === "image"`.
 */
function hasStructuredImageInput(input: ExtractStructuredWithModelParams["input"]): boolean {
  return input.some((entry) => entry.type === "image");
}
|
||||
@@ -93,7 +143,7 @@ export async function runMediaUnderstandingFile(
|
||||
},
|
||||
}
|
||||
: params.cfg;
|
||||
const ctx = buildFileContext(params);
|
||||
const ctx = buildFileContext({ ...params, capability: params.capability });
|
||||
const attachments = normalizeMediaAttachments(ctx);
|
||||
if (attachments.length === 0) {
|
||||
return {
|
||||
@@ -114,7 +164,7 @@ export async function runMediaUnderstandingFile(
|
||||
|
||||
const providerRegistry = buildProviderRegistry(undefined, cfg);
|
||||
const cache = createMediaAttachmentCache(attachments, {
|
||||
localPathRoots: [path.dirname(params.filePath)],
|
||||
localPathRoots: params.mediaUrl ? undefined : resolveFileLocalRoots(params.filePath),
|
||||
ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
|
||||
});
|
||||
|
||||
@@ -166,33 +216,18 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo
|
||||
const timeoutMs = params.timeoutMs ?? 30_000;
|
||||
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
|
||||
const provider = providerRegistry.get(normalizeMediaProviderId(params.provider));
|
||||
let buffer: Buffer;
|
||||
let fileName = path.basename(params.filePath);
|
||||
let mime = params.mime;
|
||||
if (params.mediaUrl) {
|
||||
const cache = createMediaAttachmentCache(normalizeMediaAttachments(buildFileContext(params)), {
|
||||
ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy,
|
||||
});
|
||||
try {
|
||||
const media = await cache.getBuffer({
|
||||
attachmentIndex: 0,
|
||||
maxBytes: DEFAULT_MAX_BYTES.image,
|
||||
timeoutMs,
|
||||
});
|
||||
buffer = media.buffer;
|
||||
fileName = media.fileName;
|
||||
mime = media.mime;
|
||||
} finally {
|
||||
await cache.cleanup();
|
||||
}
|
||||
} else {
|
||||
buffer = (await readLocalFileSafely({ filePath: params.filePath })).buffer;
|
||||
}
|
||||
const image = await readImageDescriptionInput({
|
||||
filePath: params.filePath,
|
||||
mediaUrl: params.mediaUrl,
|
||||
mime: params.mime,
|
||||
cfg: params.cfg,
|
||||
timeoutMs,
|
||||
});
|
||||
const describeImage = provider?.describeImage ?? describeImageWithModel;
|
||||
return await describeImage({
|
||||
buffer,
|
||||
fileName,
|
||||
mime,
|
||||
buffer: image.buffer,
|
||||
fileName: image.fileName,
|
||||
mime: image.mime,
|
||||
provider: params.provider,
|
||||
model: params.model,
|
||||
prompt: params.prompt,
|
||||
@@ -204,6 +239,45 @@ export async function describeImageFileWithModel(params: DescribeImageFileWithMo
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Loads the image bytes and metadata for an explicit image description request.
 *
 * Local references are read with `readLocalFileSafely`. Remote references (an
 * explicit `mediaUrl`, or a `filePath` that is an HTTP(S) URL) are fetched
 * through a media attachment cache configured with the web-fetch SSRF policy
 * from `cfg`; the cache is always cleaned up afterwards.
 *
 * @param params.filePath - Local path, or an HTTP(S) URL passed where a path is expected.
 * @param params.mediaUrl - Explicit remote reference; takes precedence over `filePath`.
 * @param params.mime - Caller-supplied MIME type.
 * @param params.cfg - Configuration supplying `tools.web.fetch.ssrfPolicy`.
 * @param params.timeoutMs - Timeout passed to the cache fetch.
 * @returns The image buffer, a file name, and the best-known MIME type.
 */
async function readImageDescriptionInput(params: {
  filePath: string;
  mediaUrl?: string;
  mime?: string;
  cfg: OpenClawConfig;
  timeoutMs: number;
}): Promise<{ buffer: Buffer; fileName: string; mime?: string }> {
  const remoteRef =
    params.mediaUrl ??
    (isRemoteMediaReference(params.filePath) ? params.filePath.trim() : undefined);
  if (!remoteRef) {
    return {
      buffer: (await readLocalFileSafely({ filePath: params.filePath })).buffer,
      fileName: basenameFromMediaReference(params.filePath),
      mime: params.mime,
    };
  }
  const attachments = normalizeMediaAttachments(
    buildFileContext({ ...params, capability: "image" }),
  );
  const cache = createMediaAttachmentCache(attachments, {
    ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy,
  });
  try {
    const media = await cache.getBuffer({
      attachmentIndex: 0,
      maxBytes: DEFAULT_MAX_BYTES.image,
      timeoutMs: params.timeoutMs,
    });
    return {
      buffer: media.buffer,
      // Prefer the name reported by the cache; otherwise derive one from the URL.
      fileName: media.fileName || basenameFromMediaReference(remoteRef),
      // A wildcard caller MIME (for example "image/*") is discarded by
      // concreteMime, so the MIME reported by the cache is used instead.
      mime: concreteMime(params.mime) ?? media.mime,
    };
  } finally {
    await cache.cleanup();
  }
}
|
||||
|
||||
export async function extractStructuredWithModel(params: ExtractStructuredWithModelParams) {
|
||||
const timeoutMs = params.timeoutMs ?? 30_000;
|
||||
if (!hasStructuredImageInput(params.input)) {
|
||||
|
||||
Reference in New Issue
Block a user