fix: defer image tool auto discovery

This commit is contained in:
Shakker
2026-05-02 04:14:22 +01:00
parent d94889909c
commit 22e8d7b469
3 changed files with 72 additions and 9 deletions

View File

@@ -325,6 +325,7 @@ export function createOpenClawTools(
sandbox,
fsPolicy: options?.fsPolicy,
modelHasVision: options?.modelHasVision,
deferAutoModelResolution: true,
})
: null;
options?.recordToolPrepStage?.("openclaw-tools:image-tool");

View File

@@ -628,6 +628,38 @@ describe("image tool implicit imageModel config", () => {
});
});
// Registering the image tool with `deferAutoModelResolution: true` and no
// explicit image model in the config must still produce a tool, and must not
// invoke either of the injected auto-discovery helpers while doing so.
it("defers implicit image model discovery during hot-path tool registration", async () => {
  await withTempAgentDir(async (agentDir) => {
    // Spies injected in place of the auto-discovery helpers so any call made
    // during registration is observable.
    const defaultModelLookup = vi.fn(() => "gpt-5.4-mini");
    const autoKeyProviderLookup = vi.fn(() => ["openai"]);

    __testing.setProviderDepsForTest({
      buildProviderRegistry: (overrides?: Record<string, MediaUnderstandingProvider>) =>
        imageProviderHarness.buildProviderRegistry(overrides),
      getMediaUnderstandingProvider: (
        id: string,
        registry: Map<string, MediaUnderstandingProvider>,
      ) => imageProviderHarness.getMediaUnderstandingProvider(id, registry),
      describeImageWithModel: describeGenericImageWithModel,
      describeImagesWithModel: describeGenericImagesWithModel,
      resolveDefaultMediaModel: defaultModelLookup,
      resolveAutoMediaKeyProviders: autoKeyProviderLookup,
    });

    // Only a primary chat model is set; there is no imageModel entry here.
    const config: OpenClawConfig = {
      agents: { defaults: { model: { primary: "openai/gpt-5.4" } } },
    };

    const registeredTool = createImageTool({
      config,
      agentDir,
      deferAutoModelResolution: true,
    });

    // The tool is registered, yet neither discovery helper has been called.
    expect(registeredTool).not.toBeNull();
    expect(defaultModelLookup).not.toHaveBeenCalled();
    expect(autoKeyProviderLookup).not.toHaveBeenCalled();
  });
});
it("pairs minimax primary with MiniMax-VL-01 (and fallbacks) when auth exists", async () => {
await withTempAgentDir(async (agentDir) => {
vi.stubEnv("MINIMAX_API_KEY", "minimax-test");

View File

@@ -379,22 +379,37 @@ export function createImageTool(options?: {
fsPolicy?: ToolFsPolicy;
/** If true, the model has native vision capability and images in the prompt are auto-injected */
modelHasVision?: boolean;
/**
* Avoid resolving auto image-provider/model candidates while registering the
* tool. The concrete image model is still resolved before execution.
*/
deferAutoModelResolution?: boolean;
}): AnyAgentTool | null {
const agentDir = options?.agentDir?.trim();
const explicit = coerceImageModelConfig(options?.config);
if (!agentDir) {
const explicit = coerceImageModelConfig(options?.config);
if (hasToolModelConfig(explicit)) {
throw new Error("createImageTool requires agentDir when enabled");
}
return null;
}
const imageModelConfig = resolveImageModelConfigForTool({
cfg: options?.config,
agentDir,
workspaceDir: options?.workspaceDir,
authStore: options?.authProfileStore,
});
if (!imageModelConfig) {
const explicitImageModelConfig = hasToolModelConfig(explicit)
? resolveConfiguredImageModelRefs({
cfg: options?.config,
imageModelConfig: explicit,
})
: null;
const shouldResolveAutoImageModel =
!explicitImageModelConfig && !options?.deferAutoModelResolution;
const resolvedImageModelConfig = shouldResolveAutoImageModel
? resolveImageModelConfigForTool({
cfg: options?.config,
agentDir,
workspaceDir: options?.workspaceDir,
authStore: options?.authProfileStore,
})
: explicitImageModelConfig;
if (!resolvedImageModelConfig && !options?.deferAutoModelResolution) {
return null;
}
const remoteMediaSsrfPolicy = resolveRemoteMediaSsrfPolicy(options?.config);
@@ -403,7 +418,9 @@ export function createImageTool(options?: {
// so this tool is only needed when image wasn't provided in the prompt
const description = options?.modelHasVision
? "Analyze one or more images with a vision model. Use image for a single path/URL, or images for multiple (up to 20). Only use this tool when images were NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
: "Analyze one or more images with the configured image model (agents.defaults.imageModel). Use image for a single path/URL, or images for multiple (up to 20). Provide a prompt describing what to analyze.";
: explicitImageModelConfig
? "Analyze one or more images with the configured image model (agents.defaults.imageModel). Use image for a single path/URL, or images for multiple (up to 20). Provide a prompt describing what to analyze."
: "Analyze one or more images with an available vision model. Use image for a single path/URL, or images for multiple (up to 20). Provide a prompt describing what to analyze.";
return {
label: "Image",
@@ -603,6 +620,19 @@ export function createImageTool(options?: {
}
// MARK: - Run image prompt with all loaded images
const imageModelConfig =
resolvedImageModelConfig ??
resolveImageModelConfigForTool({
cfg: options?.config,
agentDir,
workspaceDir: options?.workspaceDir,
authStore: options?.authProfileStore,
});
if (!imageModelConfig) {
throw new Error(
"No image model is configured. Set agents.defaults.imageModel or configure an image-capable provider.",
);
}
const result = await runImagePrompt({
cfg: options?.config,
agentDir,