mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 15:40:44 +00:00
fix: preserve gateway image refs for text-only models
This commit is contained in:
@@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.
|
||||
- Media understanding: honor explicit image-model configuration before native-vision skips, including `agents.defaults.imageModel`, `tools.media.image.models`, and provider image defaults such as MiniMax VL when the active chat model is text-only. Fixes #47614, #63722, #69171.
|
||||
- Codex/media understanding: support `codex/*` image models through bounded Codex app-server image turns, while keeping `openai-codex/*` on the OpenAI Codex OAuth route and validating app-server responses against generated protocol contracts. Fixes #70201.
|
||||
- Providers/OpenAI Codex: synthesize the `openai-codex/gpt-5.5` OAuth model row when Codex catalog discovery omits it, so cron and subagent runs do not fail with `Unknown model` while the account is authenticated.
|
||||
|
||||
@@ -136,6 +136,9 @@ Rules:
|
||||
- If the active primary image model already supports vision natively, OpenClaw
|
||||
skips the `[Image]` summary block and passes the original image into the
|
||||
model instead.
|
||||
- If a Gateway/WebChat primary model is text-only, image attachments are
|
||||
preserved as offloaded `media://inbound/*` refs so the image tool or configured
|
||||
image model can still inspect them instead of losing the attachment.
|
||||
- Explicit `openclaw infer image describe --model <provider/model>` requests
|
||||
are different: they run that image-capable provider/model directly, including
|
||||
Ollama refs such as `ollama/qwen2.5vl:7b`.
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { deleteMediaBuffer } from "../media/store.js";
|
||||
import {
|
||||
buildMessageWithAttachments,
|
||||
type ChatAttachment,
|
||||
@@ -16,6 +17,10 @@ async function parseWithWarnings(message: string, attachments: ChatAttachment[])
|
||||
return { parsed, logs };
|
||||
}
|
||||
|
||||
/**
 * Test helper: removes the media buffers that a parse call offloaded.
 *
 * Each ref id is handed to `deleteMediaBuffer` together with "inbound".
 * The deletions are awaited with `Promise.allSettled`, so a rejected deletion
 * does not reject this helper (cleanup is best-effort).
 *
 * @param refs - Offloaded refs whose `id` identifies the stored media to delete.
 */
async function cleanupOffloadedRefs(refs: { id: string }[]) {
  const pendingDeletions = refs.map(({ id }) => deleteMediaBuffer(id, "inbound"));
  await Promise.allSettled(pendingDeletions);
}
|
||||
|
||||
describe("buildMessageWithAttachments", () => {
|
||||
it("embeds a single image as data URL", () => {
|
||||
const msg = buildMessageWithAttachments("see this", [
|
||||
@@ -137,6 +142,68 @@ describe("parseMessageWithAttachments", () => {
|
||||
expect(parsed.images[0]?.data).toBe(PNG_1x1);
|
||||
expect(logs.some((l) => /non-image/i.test(l))).toBe(true);
|
||||
});
|
||||
|
||||
it("offloads images for text-only models instead of dropping them", async () => {
  // Capture both log channels separately: the offload notice is expected on
  // `info`, and nothing is expected on `warn`.
  const warnings: string[] = [];
  const notices: string[] = [];
  const pngAttachment: ChatAttachment = {
    type: "image",
    mimeType: "image/png",
    fileName: "dot.png",
    content: PNG_1x1,
  };

  const parsed = await parseMessageWithAttachments("see this", [pngAttachment], {
    log: {
      info: (message) => notices.push(message),
      warn: (warning) => warnings.push(warning),
    },
    supportsImages: false,
  });

  try {
    const { images, imageOrder, offloadedRefs, message } = parsed;

    // No inline image; the attachment is represented only as an offloaded ref.
    expect(images).toHaveLength(0);
    expect(imageOrder).toEqual(["offloaded"]);
    expect(offloadedRefs).toHaveLength(1);
    expect(offloadedRefs[0]?.mimeType).toBe("image/png");

    // The media marker is appended on a new line after the original text.
    expect(message).toMatch(/^see this\n\[media attached: media:\/\/inbound\//);
    expect(notices[0]).toMatch(/Offloaded image for text-only model/i);
    expect(warnings).toHaveLength(0);
  } finally {
    // Always remove the media saved by this test, even if an assertion failed.
    await cleanupOffloadedRefs(parsed.offloadedRefs);
  }
});
|
||||
|
||||
it("caps text-only image offloads", async () => {
  const warnings: string[] = [];

  // Eleven identical PNG attachments: one more than the 10 offloads asserted below.
  const attachments: ChatAttachment[] = [];
  for (let index = 0; index < 11; index += 1) {
    attachments.push({
      type: "image",
      mimeType: "image/png",
      fileName: `dot-${index}.png`,
      content: PNG_1x1,
    });
  }

  const parsed = await parseMessageWithAttachments("see these", attachments, {
    log: { warn: (warning) => warnings.push(warning) },
    supportsImages: false,
  });

  try {
    expect(parsed.images).toHaveLength(0);
    expect(parsed.offloadedRefs).toHaveLength(10);
    expect(parsed.imageOrder).toHaveLength(10);

    // One media marker per offloaded image, plus an omission note for the overflow.
    const mediaMarkers = parsed.message.match(/\[media attached: media:\/\/inbound\//g);
    expect(mediaMarkers).toHaveLength(10);
    expect(parsed.message).toContain(
      "[image attachment omitted: text-only attachment limit reached]",
    );

    const sawLimitWarning = warnings.some((line) => /offload limit 10/i.test(line));
    expect(sawLimitWarning).toBe(true);
  } finally {
    // Always remove the media saved by this test, even if an assertion failed.
    await cleanupOffloadedRefs(parsed.offloadedRefs);
  }
});
|
||||
});
|
||||
|
||||
describe("shared attachment validation", () => {
|
||||
|
||||
@@ -59,8 +59,8 @@ export type ParsedMessageWithImages = {
|
||||
* do not receive these as inline image blocks.
|
||||
*
|
||||
* ⚠️ Call sites (chat.ts, agent.ts, server-node-events.ts) MUST also pass
|
||||
* `supportsImages: modelSupportsImages(model)` so that text-only model runs
|
||||
* do not inject unresolvable media:// markers into prompt text.
|
||||
* `supportsImages: modelSupportsImages(model)` so text-only model runs
|
||||
* offload images as media refs instead of passing inline image blocks.
|
||||
*/
|
||||
offloadedRefs: OffloadedRef[];
|
||||
};
|
||||
@@ -82,6 +82,7 @@ type SavedMedia = {
|
||||
};
|
||||
|
||||
/** Attachments whose size in bytes exceeds this value take the offload path instead of being passed inline. */
const OFFLOAD_THRESHOLD_BYTES = 2_000_000;
|
||||
/**
 * Maximum number of images offloaded in one parse when the model is text-only
 * (`supportsImages: false`). Images past this count are dropped with a warning
 * and an "omitted" note is appended to the message.
 */
const TEXT_ONLY_OFFLOAD_LIMIT = 10;
|
||||
|
||||
const MIME_TO_EXT: Record<string, string> = {
|
||||
"image/jpeg": ".jpg",
|
||||
@@ -271,12 +272,9 @@ function validateAttachmentBase64OrThrow(
|
||||
* because they are not passed inline to the model.
|
||||
*
|
||||
* ## Text-only model runs
|
||||
* Pass `supportsImages: false` for text-only model runs so that no media://
|
||||
* markers are injected into prompt text.
|
||||
*
|
||||
* ⚠️ Call sites in chat.ts, agent.ts, and server-node-events.ts MUST be
|
||||
* updated to pass `supportsImages: modelSupportsImages(model)`. Until they do,
|
||||
* text-only model runs receive unresolvable media:// markers in their prompt.
|
||||
* Pass `supportsImages: false` for text-only model runs so images are offloaded
|
||||
* as `media://inbound/<id>` refs instead of being sent as inline image blocks.
|
||||
* The agent runner can then resolve the refs through the normal media path.
|
||||
*
|
||||
* ## Cleanup on failure
|
||||
* On any parse failure after files have already been offloaded, best-effort
|
||||
@@ -304,22 +302,11 @@ export async function parseMessageWithAttachments(
|
||||
return { message, images: [], imageOrder: [], offloadedRefs: [] };
|
||||
}
|
||||
|
||||
// For text-only models drop all attachments cleanly. Do not save files or
|
||||
// inject media:// markers that would never be resolved and would leak
|
||||
// internal path references into the model's prompt.
|
||||
if (opts?.supportsImages === false) {
|
||||
if (attachments.length > 0) {
|
||||
log?.warn(
|
||||
`parseMessageWithAttachments: ${attachments.length} attachment(s) dropped — model does not support images`,
|
||||
);
|
||||
}
|
||||
return { message, images: [], imageOrder: [], offloadedRefs: [] };
|
||||
}
|
||||
|
||||
const images: ChatImageContent[] = [];
|
||||
const imageOrder: PromptImageOrderEntry[] = [];
|
||||
const offloadedRefs: OffloadedRef[] = [];
|
||||
let updatedMessage = message;
|
||||
const shouldForceOffload = opts?.supportsImages === false;
|
||||
|
||||
// Track IDs of files saved during this request for cleanup if a later
|
||||
// attachment fails validation and the entire parse is aborted.
|
||||
@@ -377,10 +364,26 @@ export async function parseMessageWithAttachments(
|
||||
|
||||
let isOffloaded = false;
|
||||
|
||||
if (sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
|
||||
if (shouldForceOffload && offloadedRefs.length >= TEXT_ONLY_OFFLOAD_LIMIT) {
|
||||
log?.warn(
|
||||
`attachment ${label}: dropping image because text-only offload limit ` +
|
||||
`${TEXT_ONLY_OFFLOAD_LIMIT} was reached`,
|
||||
);
|
||||
updatedMessage += "\n[image attachment omitted: text-only attachment limit reached]";
|
||||
continue;
|
||||
}
|
||||
|
||||
if (shouldForceOffload || sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
|
||||
const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime);
|
||||
|
||||
if (!isSupportedForOffload) {
|
||||
if (shouldForceOffload) {
|
||||
log?.warn(
|
||||
`attachment ${label}: format ${finalMime} cannot be offloaded for ` +
|
||||
"text-only model, dropping",
|
||||
);
|
||||
continue;
|
||||
}
|
||||
// Passing this inline would reintroduce the OOM risk this PR prevents.
|
||||
throw new Error(
|
||||
`attachment ${label}: format ${finalMime} is too large to pass inline ` +
|
||||
@@ -418,7 +421,11 @@ export async function parseMessageWithAttachments(
|
||||
const mediaRef = `media://inbound/${savedMedia.id}`;
|
||||
|
||||
updatedMessage += `\n[media attached: ${mediaRef}]`;
|
||||
log?.info?.(`[Gateway] Intercepted large image payload. Saved: ${mediaRef}`);
|
||||
log?.info?.(
|
||||
shouldForceOffload
|
||||
? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
|
||||
: `[Gateway] Intercepted large image payload. Saved: ${mediaRef}`,
|
||||
);
|
||||
|
||||
// Record for transcript metadata — separate from `images` because
|
||||
// these are not passed inline to the model.
|
||||
|
||||
@@ -2087,7 +2087,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
expect(JSON.stringify(transcriptUpdate)).not.toContain("[[audio_as_voice]]");
|
||||
});
|
||||
|
||||
it("drops image attachments for text-only session models", async () => {
|
||||
it("offloads image attachments for text-only session models", async () => {
|
||||
createTranscriptFixture("openclaw-chat-send-text-only-attachments-");
|
||||
mockState.finalText = "ok";
|
||||
mockState.sessionEntry = {
|
||||
@@ -2123,7 +2123,13 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchImages).toBeUndefined();
|
||||
expect(mockState.lastDispatchImageOrder).toBeUndefined();
|
||||
expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
|
||||
expect(mockState.lastDispatchCtx?.Body).toMatch(
|
||||
/^describe image\n\[media attached: media:\/\/inbound\//,
|
||||
);
|
||||
expect(mockState.savedMediaCalls).toEqual([
|
||||
expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
|
||||
]);
|
||||
});
|
||||
|
||||
it("keeps image attachments for text-only sessions bound to ACP", async () => {
|
||||
@@ -2231,7 +2237,13 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchImages).toBeUndefined();
|
||||
expect(mockState.lastDispatchImageOrder).toBeUndefined();
|
||||
expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
|
||||
expect(mockState.lastDispatchCtx?.Body).toMatch(
|
||||
/^describe image\n\[media attached: media:\/\/inbound\//,
|
||||
);
|
||||
expect(mockState.savedMediaCalls).toEqual([
|
||||
expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
|
||||
]);
|
||||
});
|
||||
|
||||
it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {
|
||||
|
||||
Reference in New Issue
Block a user