fix: preserve gateway image refs for text-only models

Peter Steinberger
2026-04-24 02:40:01 +01:00
parent 92a42413df
commit 86f69ba5a0
5 changed files with 115 additions and 25 deletions

View File

@@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.
- Media understanding: honor explicit image-model configuration before native-vision skips, including `agents.defaults.imageModel`, `tools.media.image.models`, and provider image defaults such as MiniMax VL when the active chat model is text-only. Fixes #47614, #63722, #69171.
- Codex/media understanding: support `codex/*` image models through bounded Codex app-server image turns, while keeping `openai-codex/*` on the OpenAI Codex OAuth route and validating app-server responses against generated protocol contracts. Fixes #70201.
- Providers/OpenAI Codex: synthesize the `openai-codex/gpt-5.5` OAuth model row when Codex catalog discovery omits it, so cron and subagent runs do not fail with `Unknown model` while the account is authenticated.

View File

@@ -136,6 +136,9 @@ Rules:
- If the active primary image model already supports vision natively, OpenClaw
skips the `[Image]` summary block and passes the original image into the
model instead.
- If a Gateway/WebChat primary model is text-only, image attachments are
preserved as offloaded `media://inbound/*` refs, so the image tool or a
configured image model can still inspect them instead of the attachment being
dropped (see the sketch after this list).
- Explicit `openclaw infer image describe --model <provider/model>` requests
are different: they run that image-capable provider/model directly, including
Ollama refs such as `ollama/qwen2.5vl:7b`.
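
A minimal caller-side sketch of the text-only offload rule above, assuming the
`parseMessageWithAttachments` shapes shown later in this commit (the import
path and the surrounding wiring are illustrative, not OpenClaw's actual layout):

```ts
import {
  parseMessageWithAttachments,
  type ChatAttachment,
} from "./attachments.js"; // illustrative path

// Call-site helper named in this commit's doc comments; signature assumed.
declare function modelSupportsImages(model: string): boolean;

async function buildPrompt(model: string, text: string, attachments: ChatAttachment[]) {
  const parsed = await parseMessageWithAttachments(text, attachments, {
    // Text-only runs offload images as media://inbound/<id> refs instead of
    // dropping them or injecting inline image blocks.
    supportsImages: modelSupportsImages(model),
    log: console,
  });
  // For a text-only model: parsed.images is empty, parsed.offloadedRefs lists
  // the saved files, and parsed.message carries a
  // "[media attached: media://inbound/<id>]" marker per offloaded image.
  return parsed;
}
```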

View File

@@ -1,4 +1,5 @@
import { describe, expect, it, vi } from "vitest";
import { deleteMediaBuffer } from "../media/store.js";
import {
buildMessageWithAttachments,
type ChatAttachment,
@@ -16,6 +17,10 @@ async function parseWithWarnings(message: string, attachments: ChatAttachment[])
return { parsed, logs };
}
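// parseMessageWithAttachments writes offloaded images to the real inbound
// media store, so tests delete those buffers afterwards instead of leaking
// files between runs.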
async function cleanupOffloadedRefs(refs: { id: string }[]) {
await Promise.allSettled(refs.map((ref) => deleteMediaBuffer(ref.id, "inbound")));
}
describe("buildMessageWithAttachments", () => {
it("embeds a single image as data URL", () => {
const msg = buildMessageWithAttachments("see this", [
@@ -137,6 +142,68 @@ describe("parseMessageWithAttachments", () => {
expect(parsed.images[0]?.data).toBe(PNG_1x1);
expect(logs.some((l) => /non-image/i.test(l))).toBe(true);
});
it("offloads images for text-only models instead of dropping them", async () => {
const logs: string[] = [];
const infos: string[] = [];
const parsed = await parseMessageWithAttachments(
"see this",
[
{
type: "image",
mimeType: "image/png",
fileName: "dot.png",
content: PNG_1x1,
},
],
{
log: { info: (message) => infos.push(message), warn: (warning) => logs.push(warning) },
supportsImages: false,
},
);
try {
expect(parsed.images).toHaveLength(0);
expect(parsed.imageOrder).toEqual(["offloaded"]);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.mimeType).toBe("image/png");
expect(parsed.message).toMatch(/^see this\n\[media attached: media:\/\/inbound\//);
expect(infos[0]).toMatch(/Offloaded image for text-only model/i);
expect(logs).toHaveLength(0);
} finally {
await cleanupOffloadedRefs(parsed.offloadedRefs);
}
});
it("caps text-only image offloads", async () => {
const logs: string[] = [];
const attachments = Array.from(
{ length: 11 },
(_, index): ChatAttachment => ({
type: "image",
mimeType: "image/png",
fileName: `dot-${index}.png`,
content: PNG_1x1,
}),
);
const parsed = await parseMessageWithAttachments("see these", attachments, {
log: { warn: (warning) => logs.push(warning) },
supportsImages: false,
});
try {
expect(parsed.images).toHaveLength(0);
expect(parsed.offloadedRefs).toHaveLength(10);
expect(parsed.imageOrder).toHaveLength(10);
expect(parsed.message.match(/\[media attached: media:\/\/inbound\//g)).toHaveLength(10);
expect(parsed.message).toContain(
"[image attachment omitted: text-only attachment limit reached]",
);
expect(logs.some((line) => /offload limit 10/i.test(line))).toBe(true);
} finally {
await cleanupOffloadedRefs(parsed.offloadedRefs);
}
});
});
describe("shared attachment validation", () => {

View File

@@ -59,8 +59,8 @@ export type ParsedMessageWithImages = {
* do not receive these as inline image blocks.
*
* ⚠️ Call sites (chat.ts, agent.ts, server-node-events.ts) MUST also pass
-* `supportsImages: modelSupportsImages(model)` so that text-only model runs
-* do not inject unresolvable media:// markers into prompt text.
+* `supportsImages: modelSupportsImages(model)` so text-only model runs
+* offload images as media refs instead of passing inline image blocks.
*/
offloadedRefs: OffloadedRef[];
};
@@ -82,6 +82,7 @@ type SavedMedia = {
};
const OFFLOAD_THRESHOLD_BYTES = 2_000_000;
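// Upper bound on images offloaded as media:// refs in one text-only run;
// further images are dropped with an omission marker in the prompt text.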
const TEXT_ONLY_OFFLOAD_LIMIT = 10;
const MIME_TO_EXT: Record<string, string> = {
"image/jpeg": ".jpg",
@@ -271,12 +272,9 @@ function validateAttachmentBase64OrThrow(
* because they are not passed inline to the model.
*
* ## Text-only model runs
-* Pass `supportsImages: false` for text-only model runs so that no media://
-* markers are injected into prompt text.
-*
-* ⚠️ Call sites in chat.ts, agent.ts, and server-node-events.ts MUST be
-* updated to pass `supportsImages: modelSupportsImages(model)`. Until they do,
-* text-only model runs receive unresolvable media:// markers in their prompt.
+* Pass `supportsImages: false` for text-only model runs so images are offloaded
+* as `media://inbound/<id>` refs instead of being sent as inline image blocks.
+* The agent runner can then resolve the refs through the normal media path.
*
* ## Cleanup on failure
* On any parse failure after files have already been offloaded, best-effort
@@ -304,22 +302,11 @@ export async function parseMessageWithAttachments(
return { message, images: [], imageOrder: [], offloadedRefs: [] };
}
-// For text-only models drop all attachments cleanly. Do not save files or
-// inject media:// markers that would never be resolved and would leak
-// internal path references into the model's prompt.
-if (opts?.supportsImages === false) {
-if (attachments.length > 0) {
-log?.warn(
-`parseMessageWithAttachments: ${attachments.length} attachment(s) dropped — model does not support images`,
-);
-}
-return { message, images: [], imageOrder: [], offloadedRefs: [] };
-}
const images: ChatImageContent[] = [];
const imageOrder: PromptImageOrderEntry[] = [];
const offloadedRefs: OffloadedRef[] = [];
let updatedMessage = message;
+const shouldForceOffload = opts?.supportsImages === false;
// Track IDs of files saved during this request for cleanup if a later
// attachment fails validation and the entire parse is aborted.
@@ -377,10 +364,26 @@ export async function parseMessageWithAttachments(
let isOffloaded = false;
-if (sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
+if (shouldForceOffload && offloadedRefs.length >= TEXT_ONLY_OFFLOAD_LIMIT) {
+log?.warn(
+`attachment ${label}: dropping image because text-only offload limit ` +
+`${TEXT_ONLY_OFFLOAD_LIMIT} was reached`,
+);
+updatedMessage += "\n[image attachment omitted: text-only attachment limit reached]";
+continue;
+}
+if (shouldForceOffload || sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime);
if (!isSupportedForOffload) {
+if (shouldForceOffload) {
+log?.warn(
+`attachment ${label}: format ${finalMime} cannot be offloaded for ` +
+"text-only model, dropping",
+);
+continue;
+}
// Passing this inline would reintroduce the OOM risk this PR prevents.
throw new Error(
`attachment ${label}: format ${finalMime} is too large to pass inline ` +
@@ -418,7 +421,11 @@ export async function parseMessageWithAttachments(
const mediaRef = `media://inbound/${savedMedia.id}`;
updatedMessage += `\n[media attached: ${mediaRef}]`;
-log?.info?.(`[Gateway] Intercepted large image payload. Saved: ${mediaRef}`);
+log?.info?.(
+shouldForceOffload
+? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
+: `[Gateway] Intercepted large image payload. Saved: ${mediaRef}`,
+);
// Record for transcript metadata — separate from `images` because
// these are not passed inline to the model.

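The agent-runner side that the updated doc comment points at ("resolve the refs
through the normal media path") is not part of this diff. As a rough sketch of
what resolving the markers could look like, where the marker regex, the ID
charset, and the reader callback are assumptions rather than OpenClaw APIs:

```ts
// Hypothetical resolution step: only deleteMediaBuffer(id, "inbound") is
// visible in this commit; the reader is injected to avoid inventing a store API.
const MEDIA_REF_RE = /\[media attached: media:\/\/inbound\/([\w.-]+)\]/g;

async function resolveInboundRefs(
  prompt: string,
  readMediaBuffer: (id: string) => Promise<Uint8Array>,
): Promise<Array<{ id: string; data: Uint8Array }>> {
  const resolved: Array<{ id: string; data: Uint8Array }> = [];
  for (const match of prompt.matchAll(MEDIA_REF_RE)) {
    const id = match[1]; // capture group: the saved media ID
    resolved.push({ id, data: await readMediaBuffer(id) });
  }
  return resolved;
}
```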
View File

@@ -2087,7 +2087,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expect(JSON.stringify(transcriptUpdate)).not.toContain("[[audio_as_voice]]");
});
it("drops image attachments for text-only session models", async () => {
it("offloads image attachments for text-only session models", async () => {
createTranscriptFixture("openclaw-chat-send-text-only-attachments-");
mockState.finalText = "ok";
mockState.sessionEntry = {
@@ -2123,7 +2123,13 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
});
expect(mockState.lastDispatchImages).toBeUndefined();
-expect(mockState.lastDispatchImageOrder).toBeUndefined();
+expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
+expect(mockState.lastDispatchCtx?.Body).toMatch(
+/^describe image\n\[media attached: media:\/\/inbound\//,
+);
+expect(mockState.savedMediaCalls).toEqual([
+expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
+]);
});
it("keeps image attachments for text-only sessions bound to ACP", async () => {
@@ -2231,7 +2237,13 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
});
expect(mockState.lastDispatchImages).toBeUndefined();
-expect(mockState.lastDispatchImageOrder).toBeUndefined();
+expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
+expect(mockState.lastDispatchCtx?.Body).toMatch(
+/^describe image\n\[media attached: media:\/\/inbound\//,
+);
+expect(mockState.savedMediaCalls).toEqual([
+expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
+]);
});
it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {