fix: preserve gateway image refs for text-only models

Peter Steinberger
2026-04-24 02:40:01 +01:00
parent 92a42413df
commit 86f69ba5a0
5 changed files with 115 additions and 25 deletions

View File

@@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.
- Media understanding: honor explicit image-model configuration before native-vision skips, including `agents.defaults.imageModel`, `tools.media.image.models`, and provider image defaults such as MiniMax VL when the active chat model is text-only. Fixes #47614, #63722, #69171.
- Codex/media understanding: support `codex/*` image models through bounded Codex app-server image turns, while keeping `openai-codex/*` on the OpenAI Codex OAuth route and validating app-server responses against generated protocol contracts. Fixes #70201.
- Providers/OpenAI Codex: synthesize the `openai-codex/gpt-5.5` OAuth model row when Codex catalog discovery omits it, so cron and subagent runs do not fail with `Unknown model` while the account is authenticated.

View File

@@ -136,6 +136,9 @@ Rules:
- If the active primary image model already supports vision natively, OpenClaw
skips the `[Image]` summary block and passes the original image into the
model instead.
- If a Gateway/WebChat primary model is text-only, image attachments are
preserved as offloaded `media://inbound/*` refs, so the image tool or a
configured image model can still inspect them instead of the attachment being
dropped (see the sketch after this list).
- Explicit `openclaw infer image describe --model <provider/model>` requests
are different: they run that image-capable provider/model directly, including
Ollama refs such as `ollama/qwen2.5vl:7b`.
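
A minimal caller-side sketch of the text-only offload rule above, assuming the
`parseMessageWithAttachments` shapes shown later in this commit (the import
path and the surrounding wiring are illustrative, not OpenClaw's actual layout):

```ts
import {
  parseMessageWithAttachments,
  type ChatAttachment,
} from "./attachments.js"; // illustrative path

// Call-site helper named in this commit's doc comments; signature assumed.
declare function modelSupportsImages(model: string): boolean;

async function buildPrompt(model: string, text: string, attachments: ChatAttachment[]) {
  const parsed = await parseMessageWithAttachments(text, attachments, {
    // Text-only runs offload images as media://inbound/<id> refs instead of
    // dropping them or injecting inline image blocks.
    supportsImages: modelSupportsImages(model),
    log: console,
  });
  // For a text-only model: parsed.images is empty, parsed.offloadedRefs lists
  // the saved files, and parsed.message carries a
  // "[media attached: media://inbound/<id>]" marker per offloaded image.
  return parsed;
}
```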

View File

@@ -1,4 +1,5 @@
import { describe, expect, it, vi } from "vitest";
import { deleteMediaBuffer } from "../media/store.js";
import {
buildMessageWithAttachments,
type ChatAttachment,
@@ -16,6 +17,10 @@ async function parseWithWarnings(message: string, attachments: ChatAttachment[])
return { parsed, logs };
}
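// parseMessageWithAttachments writes offloaded images to the real inbound
// media store, so tests delete those buffers afterwards instead of leaking
// files between runs.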
async function cleanupOffloadedRefs(refs: { id: string }[]) {
await Promise.allSettled(refs.map((ref) => deleteMediaBuffer(ref.id, "inbound")));
}
describe("buildMessageWithAttachments", () => {
it("embeds a single image as data URL", () => {
const msg = buildMessageWithAttachments("see this", [
@@ -137,6 +142,68 @@ describe("parseMessageWithAttachments", () => {
expect(parsed.images[0]?.data).toBe(PNG_1x1);
expect(logs.some((l) => /non-image/i.test(l))).toBe(true);
});
it("offloads images for text-only models instead of dropping them", async () => {
const logs: string[] = [];
const infos: string[] = [];
const parsed = await parseMessageWithAttachments(
"see this",
[
{
type: "image",
mimeType: "image/png",
fileName: "dot.png",
content: PNG_1x1,
},
],
{
log: { info: (message) => infos.push(message), warn: (warning) => logs.push(warning) },
supportsImages: false,
},
);
try {
expect(parsed.images).toHaveLength(0);
expect(parsed.imageOrder).toEqual(["offloaded"]);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.mimeType).toBe("image/png");
expect(parsed.message).toMatch(/^see this\n\[media attached: media:\/\/inbound\//);
expect(infos[0]).toMatch(/Offloaded image for text-only model/i);
expect(logs).toHaveLength(0);
} finally {
await cleanupOffloadedRefs(parsed.offloadedRefs);
}
});
it("caps text-only image offloads", async () => {
const logs: string[] = [];
const attachments = Array.from(
{ length: 11 },
(_, index): ChatAttachment => ({
type: "image",
mimeType: "image/png",
fileName: `dot-${index}.png`,
content: PNG_1x1,
}),
);
const parsed = await parseMessageWithAttachments("see these", attachments, {
log: { warn: (warning) => logs.push(warning) },
supportsImages: false,
});
try {
expect(parsed.images).toHaveLength(0);
expect(parsed.offloadedRefs).toHaveLength(10);
expect(parsed.imageOrder).toHaveLength(10);
expect(parsed.message.match(/\[media attached: media:\/\/inbound\//g)).toHaveLength(10);
expect(parsed.message).toContain(
"[image attachment omitted: text-only attachment limit reached]",
);
expect(logs.some((line) => /offload limit 10/i.test(line))).toBe(true);
} finally {
await cleanupOffloadedRefs(parsed.offloadedRefs);
}
});
});
describe("shared attachment validation", () => {

View File

@@ -59,8 +59,8 @@ export type ParsedMessageWithImages = {
* do not receive these as inline image blocks.
*
* ⚠️ Call sites (chat.ts, agent.ts, server-node-events.ts) MUST also pass
-* `supportsImages: modelSupportsImages(model)` so that text-only model runs
-* do not inject unresolvable media:// markers into prompt text.
+* `supportsImages: modelSupportsImages(model)` so text-only model runs
+* offload images as media refs instead of passing inline image blocks.
*/
offloadedRefs: OffloadedRef[];
};
@@ -82,6 +82,7 @@ type SavedMedia = {
};
const OFFLOAD_THRESHOLD_BYTES = 2_000_000;
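// Upper bound on images offloaded as media:// refs in one text-only run;
// further images are dropped with an omission marker in the prompt text.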
const TEXT_ONLY_OFFLOAD_LIMIT = 10;
const MIME_TO_EXT: Record<string, string> = {
"image/jpeg": ".jpg",
@@ -271,12 +272,9 @@ function validateAttachmentBase64OrThrow(
* because they are not passed inline to the model.
*
* ## Text-only model runs
-* Pass `supportsImages: false` for text-only model runs so that no media://
-* markers are injected into prompt text.
-*
-* ⚠️ Call sites in chat.ts, agent.ts, and server-node-events.ts MUST be
-* updated to pass `supportsImages: modelSupportsImages(model)`. Until they do,
-* text-only model runs receive unresolvable media:// markers in their prompt.
+* Pass `supportsImages: false` for text-only model runs so images are offloaded
+* as `media://inbound/<id>` refs instead of being sent as inline image blocks.
+* The agent runner can then resolve the refs through the normal media path.
*
* ## Cleanup on failure
* On any parse failure after files have already been offloaded, best-effort
@@ -304,22 +302,11 @@ export async function parseMessageWithAttachments(
return { message, images: [], imageOrder: [], offloadedRefs: [] };
}
-// For text-only models drop all attachments cleanly. Do not save files or
-// inject media:// markers that would never be resolved and would leak
-// internal path references into the model's prompt.
-if (opts?.supportsImages === false) {
-if (attachments.length > 0) {
-log?.warn(
-`parseMessageWithAttachments: ${attachments.length} attachment(s) dropped — model does not support images`,
-);
-}
-return { message, images: [], imageOrder: [], offloadedRefs: [] };
-}
const images: ChatImageContent[] = [];
const imageOrder: PromptImageOrderEntry[] = [];
const offloadedRefs: OffloadedRef[] = [];
let updatedMessage = message;
+const shouldForceOffload = opts?.supportsImages === false;
// Track IDs of files saved during this request for cleanup if a later
// attachment fails validation and the entire parse is aborted.
@@ -377,10 +364,26 @@ export async function parseMessageWithAttachments(
let isOffloaded = false;
-if (sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
+if (shouldForceOffload && offloadedRefs.length >= TEXT_ONLY_OFFLOAD_LIMIT) {
+log?.warn(
+`attachment ${label}: dropping image because text-only offload limit ` +
+`${TEXT_ONLY_OFFLOAD_LIMIT} was reached`,
+);
+updatedMessage += "\n[image attachment omitted: text-only attachment limit reached]";
+continue;
+}
+if (shouldForceOffload || sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime);
if (!isSupportedForOffload) {
+if (shouldForceOffload) {
+log?.warn(
+`attachment ${label}: format ${finalMime} cannot be offloaded for ` +
+"text-only model, dropping",
+);
+continue;
+}
// Passing this inline would reintroduce the OOM risk this PR prevents.
throw new Error(
`attachment ${label}: format ${finalMime} is too large to pass inline ` +
@@ -418,7 +421,11 @@ export async function parseMessageWithAttachments(
const mediaRef = `media://inbound/${savedMedia.id}`;
updatedMessage += `\n[media attached: ${mediaRef}]`;
-log?.info?.(`[Gateway] Intercepted large image payload. Saved: ${mediaRef}`);
+log?.info?.(
+shouldForceOffload
+? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
+: `[Gateway] Intercepted large image payload. Saved: ${mediaRef}`,
+);
// Record for transcript metadata — separate from `images` because
// these are not passed inline to the model.

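The agent-runner side that the updated doc comment points at ("resolve the refs
through the normal media path") is not part of this diff. As a rough sketch of
what resolving the markers could look like, where the marker regex, the ID
charset, and the reader callback are assumptions rather than OpenClaw APIs:

```ts
// Hypothetical resolution step: only deleteMediaBuffer(id, "inbound") is
// visible in this commit; the reader is injected to avoid inventing a store API.
const MEDIA_REF_RE = /\[media attached: media:\/\/inbound\/([\w.-]+)\]/g;

async function resolveInboundRefs(
  prompt: string,
  readMediaBuffer: (id: string) => Promise<Uint8Array>,
): Promise<Array<{ id: string; data: Uint8Array }>> {
  const resolved: Array<{ id: string; data: Uint8Array }> = [];
  for (const match of prompt.matchAll(MEDIA_REF_RE)) {
    const id = match[1]; // capture group: the saved media ID
    resolved.push({ id, data: await readMediaBuffer(id) });
  }
  return resolved;
}
```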
View File

@@ -2087,7 +2087,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expect(JSON.stringify(transcriptUpdate)).not.toContain("[[audio_as_voice]]");
});
it("drops image attachments for text-only session models", async () => {
it("offloads image attachments for text-only session models", async () => {
createTranscriptFixture("openclaw-chat-send-text-only-attachments-");
mockState.finalText = "ok";
mockState.sessionEntry = {
@@ -2123,7 +2123,13 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
});
expect(mockState.lastDispatchImages).toBeUndefined();
-expect(mockState.lastDispatchImageOrder).toBeUndefined();
+expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
+expect(mockState.lastDispatchCtx?.Body).toMatch(
+/^describe image\n\[media attached: media:\/\/inbound\//,
+);
+expect(mockState.savedMediaCalls).toEqual([
+expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
+]);
});
it("keeps image attachments for text-only sessions bound to ACP", async () => {
@@ -2231,7 +2237,13 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
});
expect(mockState.lastDispatchImages).toBeUndefined();
-expect(mockState.lastDispatchImageOrder).toBeUndefined();
+expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
+expect(mockState.lastDispatchCtx?.Body).toMatch(
+/^describe image\n\[media attached: media:\/\/inbound\//,
+);
+expect(mockState.savedMediaCalls).toEqual([
+expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
+]);
});
it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {