From 22e8d7b469ef8a2fea57c312541659bf1b2c3102 Mon Sep 17 00:00:00 2001 From: Shakker Date: Sat, 2 May 2026 04:14:22 +0100 Subject: [PATCH] fix: defer image tool auto discovery --- src/agents/openclaw-tools.ts | 1 + src/agents/tools/image-tool.test.ts | 32 +++++++++++++++++++ src/agents/tools/image-tool.ts | 48 +++++++++++++++++++++++------ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/src/agents/openclaw-tools.ts b/src/agents/openclaw-tools.ts index e8573853a70..fb795207393 100644 --- a/src/agents/openclaw-tools.ts +++ b/src/agents/openclaw-tools.ts @@ -325,6 +325,7 @@ export function createOpenClawTools( sandbox, fsPolicy: options?.fsPolicy, modelHasVision: options?.modelHasVision, + deferAutoModelResolution: true, }) : null; options?.recordToolPrepStage?.("openclaw-tools:image-tool"); diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index 4a8fddb1a91..93f5db0a22a 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -628,6 +628,38 @@ describe("image tool implicit imageModel config", () => { }); }); + it("defers implicit image model discovery during hot-path tool registration", async () => { + await withTempAgentDir(async (agentDir) => { + const resolveDefaultMediaModelSpy = vi.fn(() => "gpt-5.4-mini"); + const resolveAutoMediaKeyProvidersSpy = vi.fn(() => ["openai"]); + __testing.setProviderDepsForTest({ + buildProviderRegistry: (overrides?: Record) => + imageProviderHarness.buildProviderRegistry(overrides), + getMediaUnderstandingProvider: ( + id: string, + registry: Map, + ) => imageProviderHarness.getMediaUnderstandingProvider(id, registry), + describeImageWithModel: describeGenericImageWithModel, + describeImagesWithModel: describeGenericImagesWithModel, + resolveDefaultMediaModel: resolveDefaultMediaModelSpy, + resolveAutoMediaKeyProviders: resolveAutoMediaKeyProvidersSpy, + }); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "openai/gpt-5.4" } } }, + }; + + const tool = createImageTool({ + config: cfg, + agentDir, + deferAutoModelResolution: true, + }); + + expect(tool).not.toBeNull(); + expect(resolveDefaultMediaModelSpy).not.toHaveBeenCalled(); + expect(resolveAutoMediaKeyProvidersSpy).not.toHaveBeenCalled(); + }); + }); + it("pairs minimax primary with MiniMax-VL-01 (and fallbacks) when auth exists", async () => { await withTempAgentDir(async (agentDir) => { vi.stubEnv("MINIMAX_API_KEY", "minimax-test"); diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 2f459701ffa..31653e2e310 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -379,22 +379,37 @@ export function createImageTool(options?: { fsPolicy?: ToolFsPolicy; /** If true, the model has native vision capability and images in the prompt are auto-injected */ modelHasVision?: boolean; + /** + * Avoid resolving auto image-provider/model candidates while registering the + * tool. The concrete image model is still resolved before execution. + */ + deferAutoModelResolution?: boolean; }): AnyAgentTool | null { const agentDir = options?.agentDir?.trim(); + const explicit = coerceImageModelConfig(options?.config); if (!agentDir) { - const explicit = coerceImageModelConfig(options?.config); if (hasToolModelConfig(explicit)) { throw new Error("createImageTool requires agentDir when enabled"); } return null; } - const imageModelConfig = resolveImageModelConfigForTool({ - cfg: options?.config, - agentDir, - workspaceDir: options?.workspaceDir, - authStore: options?.authProfileStore, - }); - if (!imageModelConfig) { + const explicitImageModelConfig = hasToolModelConfig(explicit) + ? resolveConfiguredImageModelRefs({ + cfg: options?.config, + imageModelConfig: explicit, + }) + : null; + const shouldResolveAutoImageModel = + !explicitImageModelConfig && !options?.deferAutoModelResolution; + const resolvedImageModelConfig = shouldResolveAutoImageModel + ? resolveImageModelConfigForTool({ + cfg: options?.config, + agentDir, + workspaceDir: options?.workspaceDir, + authStore: options?.authProfileStore, + }) + : explicitImageModelConfig; + if (!resolvedImageModelConfig && !options?.deferAutoModelResolution) { return null; } const remoteMediaSsrfPolicy = resolveRemoteMediaSsrfPolicy(options?.config); @@ -403,7 +418,9 @@ export function createImageTool(options?: { // so this tool is only needed when image wasn't provided in the prompt const description = options?.modelHasVision ? "Analyze one or more images with a vision model. Use image for a single path/URL, or images for multiple (up to 20). Only use this tool when images were NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you." - : "Analyze one or more images with the configured image model (agents.defaults.imageModel). Use image for a single path/URL, or images for multiple (up to 20). Provide a prompt describing what to analyze."; + : explicitImageModelConfig + ? "Analyze one or more images with the configured image model (agents.defaults.imageModel). Use image for a single path/URL, or images for multiple (up to 20). Provide a prompt describing what to analyze." + : "Analyze one or more images with an available vision model. Use image for a single path/URL, or images for multiple (up to 20). Provide a prompt describing what to analyze."; return { label: "Image", @@ -603,6 +620,19 @@ export function createImageTool(options?: { } // MARK: - Run image prompt with all loaded images + const imageModelConfig = + resolvedImageModelConfig ?? + resolveImageModelConfigForTool({ + cfg: options?.config, + agentDir, + workspaceDir: options?.workspaceDir, + authStore: options?.authProfileStore, + }); + if (!imageModelConfig) { + throw new Error( + "No image model is configured. Set agents.defaults.imageModel or configure an image-capable provider.", + ); + } const result = await runImagePrompt({ cfg: options?.config, agentDir,