From 4eb30fc13a5b5269a87f216c40f7ce2dba1eac6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=BE=89=E5=93=A5?= <13723130812@qq.com> Date: Thu, 30 Apr 2026 03:13:19 +0800 Subject: [PATCH] fix(media): surface vision pipeline diagnostics * fix: improve error message in optimizeImageToJpeg to include actual error details * fix: improve error message to include configured input for Model does not support images * fix(media): surface vision pipeline diagnostics --------- Co-authored-by: Peter Steinberger --- CHANGELOG.md | 1 + src/media-understanding/image.test.ts | 29 +++++++++++++++++++++++++++ src/media-understanding/image.ts | 9 ++++++++- src/media/web-media.test.ts | 9 ++++++++- src/media/web-media.ts | 13 ++++++++++-- 5 files changed, 57 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16ce400c773..ce193fc21c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Media: include redacted per-attempt resize failures and resolved model input capabilities in vision-pipeline errors so ARM64 image failures are diagnosable without closing the remaining routing investigation. Refs #74552. Thanks @1yihui. - Auto-reply: honor explicit `silentReply.direct: "allow"` for clean empty or reasoning-only direct chat turns while keeping the default direct-chat empty-response guard conservative. Fixes #74409. Thanks @jesuskannolis. - OpenAI Codex: send a non-empty Responses input item when a Codex turn only has systemPrompt-backed instructions, avoiding ChatGPT backend 400s from `input: []`. Fixes #73820. Thanks @woodhouse-bot. - Ollama: normalize provider-prefixed tool-call names at the native stream boundary so Kimi/Ollama calls such as `functions.exec` dispatch as `exec` instead of missing configured tools. Fixes #74487. Thanks @afurm and @carreipeia. diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts index 6d2e3144099..15d4b8185cb 100644 --- a/src/media-understanding/image.test.ts +++ b/src/media-understanding/image.test.ts @@ -296,6 +296,35 @@ describe("describeImageWithModel", () => { expect(completeMock).toHaveBeenCalledOnce(); }); + it("reports the resolved model input when an image model is text-only", async () => { + discoverModelsMock.mockReturnValue({ + find: vi.fn(() => ({ + provider: "lmstudio", + id: "text-only", + api: "openai-completions", + input: ["text"], + baseUrl: "http://127.0.0.1:1234", + })), + }); + + await expect( + describeImageWithModel({ + cfg: {}, + agentDir: "/tmp/openclaw-agent", + provider: "lmstudio", + model: "text-only", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }), + ).rejects.toThrow( + "Model does not support images: lmstudio/text-only (resolved lmstudio/text-only input: text)", + ); + expect(completeMock).not.toHaveBeenCalled(); + }); + it("passes image prompt as system instructions for codex image requests", async () => { discoverModelsMock.mockReturnValue({ find: vi.fn(() => ({ diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts index dff31f23efb..35cc3f26dc8 100644 --- a/src/media-understanding/image.ts +++ b/src/media-understanding/image.ts @@ -64,6 +64,10 @@ function isNativeResponsesReasoningPayload(model: Model): boolean { }).usesKnownNativeOpenAIRoute; } +function formatModelInputCapabilities(input: Model["input"] | undefined): string { + return input && input.length > 0 ? input.join(", ") : "none"; +} + function removeReasoningInclude(value: unknown): unknown { if (!Array.isArray(value)) { return value; @@ -192,7 +196,10 @@ async function resolveImageRuntime(params: { if (isMinimaxVlmModel(resolvedRef.provider, resolvedRef.model)) { throw new Error(`Unknown model: ${resolvedRef.provider}/${resolvedRef.model}`); } - throw new Error(`Model does not support images: ${params.provider}/${params.model}`); + throw new Error( + `Model does not support images: ${params.provider}/${params.model} ` + + `(resolved ${model.provider}/${model.id} input: ${formatModelInputCapabilities(model.input)})`, + ); } const apiKeyInfo = await getApiKeyForModel({ model, diff --git a/src/media/web-media.test.ts b/src/media/web-media.test.ts index 2a052d3a542..3b4926bd346 100644 --- a/src/media/web-media.test.ts +++ b/src/media/web-media.test.ts @@ -7,6 +7,7 @@ import { resolveStateDir } from "../config/paths.js"; import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js"; let loadWebMedia: typeof import("./web-media.js").loadWebMedia; +let optimizeImageToJpeg: typeof import("./web-media.js").optimizeImageToJpeg; const TINY_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII="; @@ -19,7 +20,7 @@ let workspaceDir = ""; let workspacePngFile = ""; beforeAll(async () => { - ({ loadWebMedia } = await import("./web-media.js")); + ({ loadWebMedia, optimizeImageToJpeg } = await import("./web-media.js")); fixtureRoot = await fs.mkdtemp(path.join(resolvePreferredOpenClawTmpDir(), "web-media-core-")); tinyPngFile = path.join(fixtureRoot, "tiny.png"); await fs.writeFile(tinyPngFile, Buffer.from(TINY_PNG_BASE64, "base64")); @@ -156,6 +157,12 @@ describe("loadWebMedia", () => { expect(result.buffer.length).toBeGreaterThan(0); }); + it("includes resize failure details when image optimization cannot produce a JPEG", async () => { + await expect(optimizeImageToJpeg(Buffer.from("not an image"), 8)).rejects.toThrow( + /Failed to optimize image: .+/, + ); + }); + it("resolves relative local media paths against the provided workspace directory", async () => { const result = await loadWebMedia("chart.png", { maxBytes: 1024 * 1024, diff --git a/src/media/web-media.ts b/src/media/web-media.ts index de84f3ee7b3..e7bf75c440e 100644 --- a/src/media/web-media.ts +++ b/src/media/web-media.ts @@ -1,6 +1,7 @@ import path from "node:path"; import { resolveCanvasHttpPathToLocalPath } from "../gateway/canvas-documents.js"; import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { formatErrorMessage } from "../infra/errors.js"; import { SafeOpenError, readLocalFileSafely } from "../infra/fs-safe.js"; import { assertNoWindowsNetworkPath, safeFileURLToPath } from "../infra/local-file-access.js"; import type { PinnedDispatcherPolicy, SsrFPolicy } from "../infra/net/ssrf.js"; @@ -616,6 +617,8 @@ export async function optimizeImageToJpeg( resizeSide: number; quality: number; } | null = null; + let firstResizeError: unknown; + const errors: string[] = []; for (const side of sides) { for (const quality of qualities) { @@ -638,7 +641,12 @@ export async function optimizeImageToJpeg( quality, }; } - } catch { + } catch (err) { + firstResizeError ??= err; + const message = formatErrorMessage(err).trim(); + if (message && !errors.includes(message)) { + errors.push(message); + } // Continue trying other size/quality combinations } } @@ -653,7 +661,8 @@ export async function optimizeImageToJpeg( }; } - throw new Error("Failed to optimize image"); + const detail = errors.length > 0 ? `: ${errors.slice(0, 3).join("; ")}` : ""; + throw new Error(`Failed to optimize image${detail}`, { cause: firstResizeError }); } export { optimizeImageToPng };