diff --git a/CHANGELOG.md b/CHANGELOG.md index 503555c6162..19ef6fbd2ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212. - Media understanding: honor explicit image-model configuration before native-vision skips, including `agents.defaults.imageModel`, `tools.media.image.models`, and provider image defaults such as MiniMax VL when the active chat model is text-only. Fixes #47614, #63722, #69171. - Codex/media understanding: support `codex/*` image models through bounded Codex app-server image turns, while keeping `openai-codex/*` on the OpenAI Codex OAuth route and validating app-server responses against generated protocol contracts. Fixes #70201. - Providers/OpenAI Codex: synthesize the `openai-codex/gpt-5.5` OAuth model row when Codex catalog discovery omits it, so cron and subagent runs do not fail with `Unknown model` while the account is authenticated. diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index ca9151e9b73..08e527c349f 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -136,6 +136,9 @@ Rules: - If the active primary image model already supports vision natively, OpenClaw skips the `[Image]` summary block and passes the original image into the model instead. +- If a Gateway/WebChat primary model is text-only, image attachments are + preserved as offloaded `media://inbound/*` refs so the image tool or configured + image model can still inspect them instead of losing the attachment. - Explicit `openclaw infer image describe --model ` requests are different: they run that image-capable provider/model directly, including Ollama refs such as `ollama/qwen2.5vl:7b`. diff --git a/src/gateway/chat-attachments.test.ts b/src/gateway/chat-attachments.test.ts index 439825bb108..2af2fa040d9 100644 --- a/src/gateway/chat-attachments.test.ts +++ b/src/gateway/chat-attachments.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it, vi } from "vitest"; +import { deleteMediaBuffer } from "../media/store.js"; import { buildMessageWithAttachments, type ChatAttachment, @@ -16,6 +17,10 @@ async function parseWithWarnings(message: string, attachments: ChatAttachment[]) return { parsed, logs }; } +async function cleanupOffloadedRefs(refs: { id: string }[]) { + await Promise.allSettled(refs.map((ref) => deleteMediaBuffer(ref.id, "inbound"))); +} + describe("buildMessageWithAttachments", () => { it("embeds a single image as data URL", () => { const msg = buildMessageWithAttachments("see this", [ @@ -137,6 +142,68 @@ describe("parseMessageWithAttachments", () => { expect(parsed.images[0]?.data).toBe(PNG_1x1); expect(logs.some((l) => /non-image/i.test(l))).toBe(true); }); + + it("offloads images for text-only models instead of dropping them", async () => { + const logs: string[] = []; + const infos: string[] = []; + const parsed = await parseMessageWithAttachments( + "see this", + [ + { + type: "image", + mimeType: "image/png", + fileName: "dot.png", + content: PNG_1x1, + }, + ], + { + log: { info: (message) => infos.push(message), warn: (warning) => logs.push(warning) }, + supportsImages: false, + }, + ); + + try { + expect(parsed.images).toHaveLength(0); + expect(parsed.imageOrder).toEqual(["offloaded"]); + expect(parsed.offloadedRefs).toHaveLength(1); + expect(parsed.offloadedRefs[0]?.mimeType).toBe("image/png"); + expect(parsed.message).toMatch(/^see this\n\[media attached: media:\/\/inbound\//); + expect(infos[0]).toMatch(/Offloaded image for text-only model/i); + expect(logs).toHaveLength(0); + } finally { + await cleanupOffloadedRefs(parsed.offloadedRefs); + } + }); + + it("caps text-only image offloads", async () => { + const logs: string[] = []; + const attachments = Array.from( + { length: 11 }, + (_, index): ChatAttachment => ({ + type: "image", + mimeType: "image/png", + fileName: `dot-${index}.png`, + content: PNG_1x1, + }), + ); + const parsed = await parseMessageWithAttachments("see these", attachments, { + log: { warn: (warning) => logs.push(warning) }, + supportsImages: false, + }); + + try { + expect(parsed.images).toHaveLength(0); + expect(parsed.offloadedRefs).toHaveLength(10); + expect(parsed.imageOrder).toHaveLength(10); + expect(parsed.message.match(/\[media attached: media:\/\/inbound\//g)).toHaveLength(10); + expect(parsed.message).toContain( + "[image attachment omitted: text-only attachment limit reached]", + ); + expect(logs.some((line) => /offload limit 10/i.test(line))).toBe(true); + } finally { + await cleanupOffloadedRefs(parsed.offloadedRefs); + } + }); }); describe("shared attachment validation", () => { diff --git a/src/gateway/chat-attachments.ts b/src/gateway/chat-attachments.ts index 18cbdc02c5b..79b86f09851 100644 --- a/src/gateway/chat-attachments.ts +++ b/src/gateway/chat-attachments.ts @@ -59,8 +59,8 @@ export type ParsedMessageWithImages = { * do not receive these as inline image blocks. * * ⚠️ Call sites (chat.ts, agent.ts, server-node-events.ts) MUST also pass - * `supportsImages: modelSupportsImages(model)` so that text-only model runs - * do not inject unresolvable media:// markers into prompt text. + * `supportsImages: modelSupportsImages(model)` so text-only model runs + * offload images as media refs instead of passing inline image blocks. */ offloadedRefs: OffloadedRef[]; }; @@ -82,6 +82,7 @@ type SavedMedia = { }; const OFFLOAD_THRESHOLD_BYTES = 2_000_000; +const TEXT_ONLY_OFFLOAD_LIMIT = 10; const MIME_TO_EXT: Record = { "image/jpeg": ".jpg", @@ -271,12 +272,9 @@ function validateAttachmentBase64OrThrow( * because they are not passed inline to the model. * * ## Text-only model runs - * Pass `supportsImages: false` for text-only model runs so that no media:// - * markers are injected into prompt text. - * - * ⚠️ Call sites in chat.ts, agent.ts, and server-node-events.ts MUST be - * updated to pass `supportsImages: modelSupportsImages(model)`. Until they do, - * text-only model runs receive unresolvable media:// markers in their prompt. + * Pass `supportsImages: false` for text-only model runs so images are offloaded + * as `media://inbound/` refs instead of being sent as inline image blocks. + * The agent runner can then resolve the refs through the normal media path. * * ## Cleanup on failure * On any parse failure after files have already been offloaded, best-effort @@ -304,22 +302,11 @@ export async function parseMessageWithAttachments( return { message, images: [], imageOrder: [], offloadedRefs: [] }; } - // For text-only models drop all attachments cleanly. Do not save files or - // inject media:// markers that would never be resolved and would leak - // internal path references into the model's prompt. - if (opts?.supportsImages === false) { - if (attachments.length > 0) { - log?.warn( - `parseMessageWithAttachments: ${attachments.length} attachment(s) dropped — model does not support images`, - ); - } - return { message, images: [], imageOrder: [], offloadedRefs: [] }; - } - const images: ChatImageContent[] = []; const imageOrder: PromptImageOrderEntry[] = []; const offloadedRefs: OffloadedRef[] = []; let updatedMessage = message; + const shouldForceOffload = opts?.supportsImages === false; // Track IDs of files saved during this request for cleanup if a later // attachment fails validation and the entire parse is aborted. @@ -377,10 +364,26 @@ export async function parseMessageWithAttachments( let isOffloaded = false; - if (sizeBytes > OFFLOAD_THRESHOLD_BYTES) { + if (shouldForceOffload && offloadedRefs.length >= TEXT_ONLY_OFFLOAD_LIMIT) { + log?.warn( + `attachment ${label}: dropping image because text-only offload limit ` + + `${TEXT_ONLY_OFFLOAD_LIMIT} was reached`, + ); + updatedMessage += "\n[image attachment omitted: text-only attachment limit reached]"; + continue; + } + + if (shouldForceOffload || sizeBytes > OFFLOAD_THRESHOLD_BYTES) { const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime); if (!isSupportedForOffload) { + if (shouldForceOffload) { + log?.warn( + `attachment ${label}: format ${finalMime} cannot be offloaded for ` + + "text-only model, dropping", + ); + continue; + } // Passing this inline would reintroduce the OOM risk this PR prevents. throw new Error( `attachment ${label}: format ${finalMime} is too large to pass inline ` + @@ -418,7 +421,11 @@ export async function parseMessageWithAttachments( const mediaRef = `media://inbound/${savedMedia.id}`; updatedMessage += `\n[media attached: ${mediaRef}]`; - log?.info?.(`[Gateway] Intercepted large image payload. Saved: ${mediaRef}`); + log?.info?.( + shouldForceOffload + ? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}` + : `[Gateway] Intercepted large image payload. Saved: ${mediaRef}`, + ); // Record for transcript metadata — separate from `images` because // these are not passed inline to the model. diff --git a/src/gateway/server-methods/chat.directive-tags.test.ts b/src/gateway/server-methods/chat.directive-tags.test.ts index 8e515fa45a6..425b0f35be6 100644 --- a/src/gateway/server-methods/chat.directive-tags.test.ts +++ b/src/gateway/server-methods/chat.directive-tags.test.ts @@ -2087,7 +2087,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => expect(JSON.stringify(transcriptUpdate)).not.toContain("[[audio_as_voice]]"); }); - it("drops image attachments for text-only session models", async () => { + it("offloads image attachments for text-only session models", async () => { createTranscriptFixture("openclaw-chat-send-text-only-attachments-"); mockState.finalText = "ok"; mockState.sessionEntry = { @@ -2123,7 +2123,13 @@ describe("chat directive tag stripping for non-streaming final payloads", () => }); expect(mockState.lastDispatchImages).toBeUndefined(); - expect(mockState.lastDispatchImageOrder).toBeUndefined(); + expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]); + expect(mockState.lastDispatchCtx?.Body).toMatch( + /^describe image\n\[media attached: media:\/\/inbound\//, + ); + expect(mockState.savedMediaCalls).toEqual([ + expect.objectContaining({ contentType: "image/png", subdir: "inbound" }), + ]); }); it("keeps image attachments for text-only sessions bound to ACP", async () => { @@ -2231,7 +2237,13 @@ describe("chat directive tag stripping for non-streaming final payloads", () => }); expect(mockState.lastDispatchImages).toBeUndefined(); - expect(mockState.lastDispatchImageOrder).toBeUndefined(); + expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]); + expect(mockState.lastDispatchCtx?.Body).toMatch( + /^describe image\n\[media attached: media:\/\/inbound\//, + ); + expect(mockState.savedMediaCalls).toEqual([ + expect.objectContaining({ contentType: "image/png", subdir: "inbound" }), + ]); }); it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {