From 391796a3fb219aef54b13a02f15d36b18d2a532c Mon Sep 17 00:00:00 2001 From: Sebastian <19554889+sebslight@users.noreply.github.com> Date: Mon, 16 Feb 2026 21:34:27 -0500 Subject: [PATCH] fix(agents): restore multi-image image tool schema contract --- CHANGELOG.md | 1 + src/agents/tools/image-tool.e2e.test.ts | 135 +++++++++++++++++++++++- src/agents/tools/image-tool.ts | 48 ++++++--- 3 files changed, 167 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe83be615ef..69eccb8c3b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ Docs: https://docs.openclaw.ai - Agents/Tools/exec: add a preflight guard that detects likely shell env var injection (e.g. `$DM_JSON`, `$TMPDIR`) in Python/Node scripts before execution, preventing recurring cron failures and wasted tokens when models emit mixed shell+language source. (#12836) - Agents/Tools: make loop detection progress-aware and phased by hard-blocking known `process(action=poll|log)` no-progress loops, warning on generic identical-call repeats, warning + no-progress-blocking ping-pong alternation loops (10/20), coalescing repeated warning spam into threshold buckets (including canonical ping-pong pairs), adding a global circuit breaker at 30 no-progress repeats, and emitting structured diagnostic `tool.loop` warning/error events for loop actions. (#16808) Thanks @akramcodez and @beca-oc. - Agents/Tools: scope the `message` tool schema to the active channel so Telegram uses `buttons` and Discord uses `components`. (#18215) Thanks @obviyus. +- Agents/Image tool: replace Anthropic-incompatible union schema with explicit `image` (single) and `images` (multi) parameters, keeping tool schemas `anyOf`/`oneOf`/`allOf`-free while preserving multi-image analysis support. (#18551, #18566) Thanks @aldoeliacim. 
- Agents/Models: probe the primary model when its auth-profile cooldown is near expiry (with per-provider throttling), so runs recover from temporary rate limits without staying on fallback models until restart. (#17478) Thanks @PlayerGhost. - Agents/Failover: classify provider abort stop-reason errors (`Unhandled stop reason: abort`, `stop reason: abort`, `reason: abort`) as timeout-class failures so configured model fallback chains trigger instead of surfacing raw abort failures. (#18618) Thanks @sauerdaniel. - Models/CLI: sync auth-profiles credentials into agent `auth.json` before registry availability checks so `openclaw models list --all` reports auth correctly for API-key/token providers, normalize provider-id aliases when bridging credentials, and skip expired token mirrors. (#18610, #18615) diff --git a/src/agents/tools/image-tool.e2e.test.ts b/src/agents/tools/image-tool.e2e.test.ts index bd9d5113f7d..5245c74b4ad 100644 --- a/src/agents/tools/image-tool.e2e.test.ts +++ b/src/agents/tools/image-tool.e2e.test.ts @@ -18,6 +18,7 @@ async function writeAuthProfiles(agentDir: string, profiles: unknown) { const ONE_PIXEL_PNG_B64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII="; +const ONE_PIXEL_GIF_B64 = "R0lGODlhAQABAIABAP///wAAACwAAAAAAQABAAACAkQBADs="; async function withTempWorkspacePng( cb: (args: { workspaceDir: string; imagePath: string }) => Promise<void>, @@ -78,6 +79,25 @@ async function expectImageToolExecOk( }); } +function findSchemaUnionKeywords(schema: unknown, path = "root"): string[] { + if (!schema || typeof schema !== "object") { + return []; + } + if (Array.isArray(schema)) { + return schema.flatMap((item, index) => findSchemaUnionKeywords(item, `${path}[${index}]`)); + } + const record = schema as Record<string, unknown>; + const out: string[] = []; + for (const [key, value] of Object.entries(record)) { + const nextPath = `${path}.${key}`; + if (key === "anyOf" || key === "oneOf" || key === "allOf") { + 
out.push(nextPath); + } + out.push(...findSchemaUnionKeywords(value, nextPath)); + } + return out; +} + describe("image tool implicit imageModel config", () => { const priorFetch = global.fetch; @@ -211,6 +231,66 @@ describe("image tool implicit imageModel config", () => { expect(tool?.description).toContain("Only use this tool when images were NOT already provided"); }); + it("exposes an Anthropic-safe image schema without union keywords", async () => { + const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-image-")); + try { + const cfg = createMinimaxImageConfig(); + const tool = createImageTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + if (!tool) { + throw new Error("expected image tool"); + } + + const violations = findSchemaUnionKeywords(tool.parameters, "image.parameters"); + expect(violations).toEqual([]); + + const schema = tool.parameters as { + properties?: Record<string, unknown>; + }; + const imageSchema = schema.properties?.image as { type?: unknown } | undefined; + const imagesSchema = schema.properties?.images as + | { type?: unknown; items?: unknown } + | undefined; + const imageItems = imagesSchema?.items as { type?: unknown } | undefined; + + expect(imageSchema?.type).toBe("string"); + expect(imagesSchema?.type).toBe("array"); + expect(imageItems?.type).toBe("string"); + } finally { + await fs.rm(agentDir, { recursive: true, force: true }); + } + }); + + it("keeps an Anthropic-safe image schema snapshot", async () => { + const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-image-")); + try { + const cfg = createMinimaxImageConfig(); + const tool = createImageTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + if (!tool) { + throw new Error("expected image tool"); + } + + expect(JSON.parse(JSON.stringify(tool.parameters))).toEqual({ + type: "object", + properties: { + prompt: { type: "string" }, + image: { description: "Single image path or URL.", type: "string" }, + images: { + description: "Multiple 
image paths or URLs (up to maxImages, default 20).", + type: "array", + items: { type: "string" }, + }, + model: { type: "string" }, + maxBytesMb: { type: "number" }, + maxImages: { type: "number" }, + }, + }); + } finally { + await fs.rm(agentDir, { recursive: true, force: true }); + } + }); + it("allows workspace images outside default local media roots", async () => { await withTempWorkspacePng(async ({ workspaceDir, imagePath }) => { const fetch = stubMinimaxOkFetch(); @@ -412,7 +492,7 @@ describe("image tool MiniMax VLM routing", () => { return { fetch, tool }; } - it("calls /v1/coding_plan/vlm for minimax image models", async () => { + it("accepts image for single-image requests and calls /v1/coding_plan/vlm", async () => { const { fetch, tool } = await createMinimaxVlmFixture({ status_code: 0, status_msg: "" }); const res = await tool.execute("t1", { @@ -434,6 +514,59 @@ describe("image tool MiniMax VLM routing", () => { expect(text).toBe("ok"); }); + it("accepts images[] for multi-image requests", async () => { + const { fetch, tool } = await createMinimaxVlmFixture({ status_code: 0, status_msg: "" }); + + const res = await tool.execute("t1", { + prompt: "Compare these images.", + images: [`data:image/png;base64,${pngB64}`, `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`], + }); + + expect(fetch).toHaveBeenCalledTimes(1); + const details = res.details as + | { + images?: Array<{ image: string }>; + } + | undefined; + expect(details?.images).toHaveLength(2); + }); + + it("combines image + images with dedupe and enforces maxImages", async () => { + const { fetch, tool } = await createMinimaxVlmFixture({ status_code: 0, status_msg: "" }); + + const deduped = await tool.execute("t1", { + prompt: "Compare these images.", + image: `data:image/png;base64,${pngB64}`, + images: [ + `data:image/png;base64,${pngB64}`, + `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`, + `data:image/gif;base64,${ONE_PIXEL_GIF_B64}`, + ], + }); + + expect(fetch).toHaveBeenCalledTimes(1); + 
const dedupedDetails = deduped.details as + | { + images?: Array<{ image: string }>; + } + | undefined; + expect(dedupedDetails?.images).toHaveLength(2); + + const tooMany = await tool.execute("t2", { + prompt: "Compare these images.", + image: `data:image/png;base64,${pngB64}`, + images: [`data:image/gif;base64,${ONE_PIXEL_GIF_B64}`], + maxImages: 1, + }); + + expect(fetch).toHaveBeenCalledTimes(1); + expect(tooMany.details).toMatchObject({ + error: "too_many_images", + count: 2, + max: 1, + }); + }); + it("surfaces MiniMax API errors from /v1/coding_plan/vlm", async () => { const { tool } = await createMinimaxVlmFixture({ status_code: 1004, status_msg: "bad key" }); diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index be92b6ce5ef..55072fb1232 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -1,7 +1,9 @@ -import path from "node:path"; import { type Api, type Context, complete, type Model } from "@mariozechner/pi-ai"; import { Type } from "@sinclair/typebox"; +import path from "node:path"; import type { OpenClawConfig } from "../../config/config.js"; +import type { SandboxFsBridge } from "../sandbox/fs-bridge.js"; +import type { AnyAgentTool } from "./common.js"; import { resolveUserPath } from "../../utils.js"; import { getDefaultLocalRoots, loadWebMedia } from "../../web/media.js"; import { ensureAuthProfileStore, listProfilesForProvider } from "../auth-profiles.js"; @@ -12,9 +14,7 @@ import { runWithImageModelFallback } from "../model-fallback.js"; import { resolveConfiguredModelRef } from "../model-selection.js"; import { ensureOpenClawModelsJson } from "../models-config.js"; import { discoverAuthStorage, discoverModels } from "../pi-model-discovery.js"; -import type { SandboxFsBridge } from "../sandbox/fs-bridge.js"; import { normalizeWorkspaceDir } from "../workspace-dir.js"; -import type { AnyAgentTool } from "./common.js"; import { coerceImageAssistantText, coerceImageModelConfig, @@ -358,8 
+358,8 @@ export function createImageTool(options?: { // If model has native vision, images in the prompt are auto-injected // so this tool is only needed when image wasn't provided in the prompt const description = options?.modelHasVision - ? "Analyze one or more images with a vision model. Pass a single image path/URL or an array of up to 20. Only use this tool when images were NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you." - : "Analyze one or more images with the configured image model (agents.defaults.imageModel). Pass a single image path/URL or an array of up to 20. Provide a prompt describing what to analyze."; + ? "Analyze one or more images with a vision model. Use image for a single path/URL, or images for multiple (up to 20). Only use this tool when images were NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you." + : "Analyze one or more images with the configured image model (agents.defaults.imageModel). Use image for a single path/URL, or images for multiple (up to 20). Provide a prompt describing what to analyze."; const localRoots = (() => { const roots = getDefaultLocalRoots(); @@ -376,7 +376,12 @@ export function createImageTool(options?: { description, parameters: Type.Object({ prompt: Type.Optional(Type.String()), - image: Type.String({ description: "Image path or URL (pass multiple as comma-separated)" }), + image: Type.Optional(Type.String({ description: "Single image path or URL." })), + images: Type.Optional( + Type.Array(Type.String(), { + description: "Multiple image paths or URLs (up to maxImages, default 20).", + }), + ), model: Type.Optional(Type.String()), maxBytesMb: Type.Optional(Type.Number()), maxImages: Type.Optional(Type.Number()), @@ -384,17 +389,28 @@ export function createImageTool(options?: { execute: async (_toolCallId, args) => { const record = args && typeof args === "object" ? 
(args as Record<string, unknown>) : {}; - // MARK: - Normalize image input (string | string[]) - const rawImageInput = record.image; - const imageInputs: string[] = (() => { - if (typeof rawImageInput === "string") { - return [rawImageInput]; + // MARK: - Normalize image + images input and dedupe while preserving order + const imageCandidates: string[] = []; + if (typeof record.image === "string") { + imageCandidates.push(record.image); + } + if (Array.isArray(record.images)) { + imageCandidates.push(...record.images.filter((v): v is string => typeof v === "string")); + } + + const seenImages = new Set<string>(); + const imageInputs: string[] = []; + for (const candidate of imageCandidates) { + const trimmedCandidate = candidate.trim(); + const normalizedForDedupe = trimmedCandidate.startsWith("@") + ? trimmedCandidate.slice(1).trim() + : trimmedCandidate; + if (!normalizedForDedupe || seenImages.has(normalizedForDedupe)) { + continue; } - if (Array.isArray(rawImageInput)) { - return rawImageInput.filter((v): v is string => typeof v === "string"); - } - return []; - })(); + seenImages.add(normalizedForDedupe); + imageInputs.push(trimmedCandidate); + } if (imageInputs.length === 0) { throw new Error("image required"); }