From ff4f59ec9062bfa675257f8fd5ef30df1f5e21db Mon Sep 17 00:00:00 2001 From: Tyler Yust <64381258+tyler6204@users.noreply.github.com> Date: Sun, 15 Feb 2026 13:45:17 -0800 Subject: [PATCH] feat(image-tool): support multiple images in a single tool call (#17512) * feat(image-tool): support multiple images in a single tool call - Change 'image' parameter to accept string | string[] (Type.Union) - Add 'maxImages' parameter (default 5) to cap abuse/token explosion - Update buildImageContext to create multiple image content parts - Normalize single string input to array for unified processing - Keep full backward compatibility: single string works as before - Update tool descriptions for both vision and non-vision models - MiniMax VLM falls back to first image (single-image API) - Details output adapts: 'image' key for single, 'images' for multi * bump default max images from 5 to 20 --- src/agents/tools/image-tool.e2e.test.ts | 4 +- src/agents/tools/image-tool.ts | 242 ++++++++++++++++-------- 2 files changed, 159 insertions(+), 87 deletions(-) diff --git a/src/agents/tools/image-tool.e2e.test.ts b/src/agents/tools/image-tool.e2e.test.ts index d5daf9d5de7..2b58753777c 100644 --- a/src/agents/tools/image-tool.e2e.test.ts +++ b/src/agents/tools/image-tool.e2e.test.ts @@ -192,9 +192,7 @@ describe("image tool implicit imageModel config", () => { }); const tool = createImageTool({ config: cfg, agentDir, modelHasVision: true }); expect(tool).not.toBeNull(); - expect(tool?.description).toContain( - "Only use this tool when the image was NOT already provided", - ); + expect(tool?.description).toContain("Only use this tool when images were NOT already provided"); }); it("allows workspace images outside default local media roots", async () => { diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 896b7447138..3d63623b778 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -26,6 +26,7 @@ import { const DEFAULT_PROMPT = "Describe the image."; const ANTHROPIC_IMAGE_PRIMARY = "anthropic/claude-opus-4-6"; const ANTHROPIC_IMAGE_FALLBACK = "anthropic/claude-opus-4-5"; +const DEFAULT_MAX_IMAGES = 20; export const __testing = { decodeDataUrl, @@ -182,15 +183,21 @@ function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undef return undefined; } -function buildImageContext(prompt: string, base64: string, mimeType: string): Context { +function buildImageContext( + prompt: string, + images: Array<{ base64: string; mimeType: string }>, +): Context { + const content: Array< + { type: "text"; text: string } | { type: "image"; data: string; mimeType: string } + > = [{ type: "text", text: prompt }]; + for (const img of images) { + content.push({ type: "image", data: img.base64, mimeType: img.mimeType }); + } return { messages: [ { role: "user", - content: [ - { type: "text", text: prompt }, - { type: "image", data: base64, mimeType }, - ], + content, timestamp: Date.now(), }, ], @@ -242,8 +249,7 @@ async function runImagePrompt(params: { imageModelConfig: ImageModelConfig; modelOverride?: string; prompt: string; - base64: string; - mimeType: string; + images: Array<{ base64: string; mimeType: string }>; }): Promise<{ text: string; provider: string; @@ -285,9 +291,11 @@ async function runImagePrompt(params: { }); const apiKey = requireApiKey(apiKeyInfo, model.provider); authStorage.setRuntimeApiKey(model.provider, apiKey); - const imageDataUrl = `data:${params.mimeType};base64,${params.base64}`; + // MiniMax VLM only supports a single image; use the first one. if (model.provider === "minimax") { + const first = params.images[0]; + const imageDataUrl = `data:${first.mimeType};base64,${first.base64}`; const text = await minimaxUnderstandImage({ apiKey, prompt: params.prompt, @@ -297,7 +305,7 @@ async function runImagePrompt(params: { return { text, provider: model.provider, model: model.id }; } - const context = buildImageContext(params.prompt, params.base64, params.mimeType); + const context = buildImageContext(params.prompt, params.images); const message = await complete(model, context, { apiKey, maxTokens: resolveImageToolMaxTokens(model.maxTokens), @@ -350,8 +358,8 @@ export function createImageTool(options?: { // If model has native vision, images in the prompt are auto-injected // so this tool is only needed when image wasn't provided in the prompt const description = options?.modelHasVision - ? "Analyze an image with a vision model. Only use this tool when the image was NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you." - : "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL."; + ? "Analyze one or more images with a vision model. Pass a single image path/URL or an array of up to 20. Only use this tool when images were NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you." + : "Analyze one or more images with the configured image model (agents.defaults.imageModel). Pass a single image path/URL or an array of up to 20. Provide a prompt describing what to analyze."; const localRoots = (() => { const roots = getDefaultLocalRoots(); @@ -368,44 +376,47 @@ export function createImageTool(options?: { description, parameters: Type.Object({ prompt: Type.Optional(Type.String()), - image: Type.String(), + image: Type.Union([Type.String(), Type.Array(Type.String())]), model: Type.Optional(Type.String()), maxBytesMb: Type.Optional(Type.Number()), + maxImages: Type.Optional(Type.Number()), }), execute: async (_toolCallId, args) => { const record = args && typeof args === "object" ? (args as Record) : {}; - const imageRawInput = typeof record.image === "string" ? record.image.trim() : ""; - const imageRaw = imageRawInput.startsWith("@") - ? imageRawInput.slice(1).trim() - : imageRawInput; - if (!imageRaw) { + + // MARK: - Normalize image input (string | string[]) + const rawImageInput = record.image; + const imageInputs: string[] = (() => { + if (typeof rawImageInput === "string") { + return [rawImageInput]; + } + if (Array.isArray(rawImageInput)) { + return rawImageInput.filter((v): v is string => typeof v === "string"); + } + return []; + })(); + if (imageInputs.length === 0) { throw new Error("image required"); } - // The tool accepts file paths, file/data URLs, or http(s) URLs. In some - // agent/model contexts, images can be referenced as pseudo-URIs like - // `image:0` (e.g. "first image in the prompt"). We don't have access to a - // shared image registry here, so fail gracefully instead of attempting to - // `fs.readFile("image:0")` and producing a noisy ENOENT. - const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(imageRaw); - const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(imageRaw); - const isFileUrl = /^file:/i.test(imageRaw); - const isHttpUrl = /^https?:\/\//i.test(imageRaw); - const isDataUrl = /^data:/i.test(imageRaw); - if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) { + // MARK: - Enforce max images cap + const maxImagesRaw = typeof record.maxImages === "number" ? record.maxImages : undefined; + const maxImages = + typeof maxImagesRaw === "number" && Number.isFinite(maxImagesRaw) && maxImagesRaw > 0 + ? Math.floor(maxImagesRaw) + : DEFAULT_MAX_IMAGES; + if (imageInputs.length > maxImages) { return { content: [ { type: "text", - text: `Unsupported image reference: ${imageRawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`, + text: `Too many images: ${imageInputs.length} provided, maximum is ${maxImages}. Please reduce the number of images.`, }, ], - details: { - error: "unsupported_image_reference", - image: imageRawInput, - }, + details: { error: "too_many_images", count: imageInputs.length, max: maxImages }, }; } + const promptRaw = typeof record.prompt === "string" && record.prompt.trim() ? record.prompt.trim() @@ -419,73 +430,136 @@ export function createImageTool(options?: { options?.sandbox && options?.sandbox.root.trim() ? { root: options.sandbox.root.trim(), bridge: options.sandbox.bridge } : null; - const isUrl = isHttpUrl; - if (sandboxConfig && isUrl) { - throw new Error("Sandboxed image tool does not allow remote URLs."); - } - const resolvedImage = (() => { - if (sandboxConfig) { + // MARK: - Load and resolve each image + const loadedImages: Array<{ + base64: string; + mimeType: string; + resolvedImage: string; + rewrittenFrom?: string; + }> = []; + + for (const imageRawInput of imageInputs) { + const trimmed = imageRawInput.trim(); + const imageRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed; + if (!imageRaw) { + throw new Error("image required (empty string in array)"); + } + + // The tool accepts file paths, file/data URLs, or http(s) URLs. In some + // agent/model contexts, images can be referenced as pseudo-URIs like + // `image:0` (e.g. "first image in the prompt"). We don't have access to a + // shared image registry here, so fail gracefully instead of attempting to + // `fs.readFile("image:0")` and producing a noisy ENOENT. + const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(imageRaw); + const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(imageRaw); + const isFileUrl = /^file:/i.test(imageRaw); + const isHttpUrl = /^https?:\/\//i.test(imageRaw); + const isDataUrl = /^data:/i.test(imageRaw); + if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) { + return { + content: [ + { + type: "text", + text: `Unsupported image reference: ${imageRawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`, + }, + ], + details: { + error: "unsupported_image_reference", + image: imageRawInput, + }, + }; + } + + if (sandboxConfig && isHttpUrl) { + throw new Error("Sandboxed image tool does not allow remote URLs."); + } + + const resolvedImage = (() => { + if (sandboxConfig) { + return imageRaw; + } + if (imageRaw.startsWith("~")) { + return resolveUserPath(imageRaw); + } return imageRaw; - } - if (imageRaw.startsWith("~")) { - return resolveUserPath(imageRaw); - } - return imageRaw; - })(); - const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl - ? { resolved: "" } - : sandboxConfig - ? await resolveSandboxedImagePath({ - sandbox: sandboxConfig, - imagePath: resolvedImage, - }) - : { - resolved: resolvedImage.startsWith("file://") - ? resolvedImage.slice("file://".length) - : resolvedImage, - }; - const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved; + })(); + const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl + ? { resolved: "" } + : sandboxConfig + ? await resolveSandboxedImagePath({ + sandbox: sandboxConfig, + imagePath: resolvedImage, + }) + : { + resolved: resolvedImage.startsWith("file://") + ? resolvedImage.slice("file://".length) + : resolvedImage, + }; + const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved; - const media = isDataUrl - ? decodeDataUrl(resolvedImage) - : sandboxConfig - ? await loadWebMedia(resolvedPath ?? resolvedImage, { - maxBytes, - sandboxValidated: true, - readFile: (filePath) => - sandboxConfig.bridge.readFile({ filePath, cwd: sandboxConfig.root }), - }) - : await loadWebMedia(resolvedPath ?? resolvedImage, { - maxBytes, - localRoots, - }); - if (media.kind !== "image") { - throw new Error(`Unsupported media type: ${media.kind}`); + const media = isDataUrl + ? decodeDataUrl(resolvedImage) + : sandboxConfig + ? await loadWebMedia(resolvedPath ?? resolvedImage, { + maxBytes, + sandboxValidated: true, + readFile: (filePath) => + sandboxConfig.bridge.readFile({ filePath, cwd: sandboxConfig.root }), + }) + : await loadWebMedia(resolvedPath ?? resolvedImage, { + maxBytes, + localRoots, + }); + if (media.kind !== "image") { + throw new Error(`Unsupported media type: ${media.kind}`); + } + + const mimeType = + ("contentType" in media && media.contentType) || + ("mimeType" in media && media.mimeType) || + "image/png"; + const base64 = media.buffer.toString("base64"); + loadedImages.push({ + base64, + mimeType, + resolvedImage, + ...(resolvedPathInfo.rewrittenFrom + ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom } + : {}), + }); } - const mimeType = - ("contentType" in media && media.contentType) || - ("mimeType" in media && media.mimeType) || - "image/png"; - const base64 = media.buffer.toString("base64"); + // MARK: - Run image prompt with all loaded images const result = await runImagePrompt({ cfg: options?.config, agentDir, imageModelConfig, modelOverride, prompt: promptRaw, - base64, - mimeType, + images: loadedImages.map((img) => ({ base64: img.base64, mimeType: img.mimeType })), }); + + const imageDetails = + loadedImages.length === 1 + ? { + image: loadedImages[0].resolvedImage, + ...(loadedImages[0].rewrittenFrom + ? { rewrittenFrom: loadedImages[0].rewrittenFrom } + : {}), + } + : { + images: loadedImages.map((img) => ({ + image: img.resolvedImage, + ...(img.rewrittenFrom ? { rewrittenFrom: img.rewrittenFrom } : {}), + })), + }; + return { content: [{ type: "text", text: result.text }], details: { model: `${result.provider}/${result.model}`, - image: resolvedImage, - ...(resolvedPathInfo.rewrittenFrom - ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom } - : {}), + ...imageDetails, attempts: result.attempts, }, };