[Feat] Gateway: offload non-image attachments on chat.send (#67572)

Merged via squash. Prepared head SHA: ecbd27fc30 Co-authored-by: samzong <13782141+samzong@users.noreply.github.com> Co-authored-by: frankekn <4488090+frankekn@users.noreply.github.com> Reviewed-by: @frankekn
2026-05-06 05:50:43 +00:00 · 2026-04-28 12:55:00 +08:00
parent a68cc94c36
commit 25ef9c0c41
16 changed files with 1119 additions and 369 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@ Docs: https://docs.openclaw.ai

 ## Unreleased

+### Changes
+
+- Gateway/chat: accept non-image attachments through `chat.send` by staging them as agent-readable media paths, while keeping unsupported RPC attachment paths explicit instead of silently dropping files. Fixes #48123. (#67572) Thanks @samzong.
+
 ## 2026.4.27

 ### Changes
--- a/src/auto-reply/reply/get-reply.ts
+++ b/src/auto-reply/reply/get-reply.ts
@@ -613,7 +613,11 @@ export async function getReplyFromConfig(
    }
  }

-  if (!useFastTestBootstrap && sessionKey && hasInboundMedia(ctx)) {
+  // ctx.MediaStaged=true means the caller (e.g. chat.send RPC) already staged
+  // synchronously so it could surface 5xx before respond(). Skipping here keeps
+  // staging a single-call contract instead of relying on relative-path no-op
+  // semantics in stageSandboxMedia.
+  if (!useFastTestBootstrap && sessionKey && !ctx.MediaStaged && hasInboundMedia(ctx)) {
    const { stageSandboxMedia } = await loadStageSandboxMediaRuntime();
    await stageSandboxMedia({
      ctx,
--- a/src/auto-reply/reply/stage-sandbox-media.ts
+++ b/src/auto-reply/reply/stage-sandbox-media.ts
@@ -19,18 +19,31 @@ import type { MsgContext, TemplateContext } from "../templating.js";

 const STAGED_MEDIA_MAX_BYTES = MEDIA_MAX_BYTES;

+// `staged` maps every absolute source path that was copied into the sandbox
+// (or remote cache) to its rewritten ctx path. Callers like chat.send's
+// prestage use this to detect partial failures: unstaged sources keep their
+// original absolute path in ctx.MediaPaths, so a length check against the
+// input cannot distinguish "everything staged" from "silently skipped some"
+// (e.g. the 5MB cap in STAGED_MEDIA_MAX_BYTES rejecting files that the
+// chat.send RPC already admitted under its 20MB cap).
+export type StageSandboxMediaResult = {
+  staged: ReadonlyMap<string, string>;
+};
+
+const EMPTY_STAGE_RESULT: StageSandboxMediaResult = { staged: new Map() };
+
 export async function stageSandboxMedia(params: {
  ctx: MsgContext;
  sessionCtx: TemplateContext;
  cfg: OpenClawConfig;
  sessionKey?: string;
  workspaceDir: string;
-}) {
+}): Promise<StageSandboxMediaResult> {
  const { ctx, sessionCtx, cfg, sessionKey, workspaceDir } = params;
  const hasPathsArray = Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0;
  const rawPaths = resolveRawPaths(ctx);
  if (rawPaths.length === 0 || !sessionKey) {
-    return;
+    return EMPTY_STAGE_RESULT;
  }

  const sandbox = await ensureSandboxWorkspaceForSession({
@@ -45,7 +58,7 @@ export async function stageSandboxMedia(params: {
    : null;
  const effectiveWorkspaceDir = sandbox?.workspaceDir ?? remoteMediaCacheDir;
  if (!effectiveWorkspaceDir) {
-    return;
+    return EMPTY_STAGE_RESULT;
  }

  await fs.mkdir(effectiveWorkspaceDir, { recursive: true });
@@ -116,6 +129,8 @@ export async function stageSandboxMedia(params: {
    staged,
    hasPathsArray,
  });
+
+  return { staged };
 }

 async function stageLocalFileIntoRoot(params: {
--- a/src/auto-reply/templating.ts
+++ b/src/auto-reply/templating.ts
@@ -123,8 +123,15 @@ export type MsgContext = {
  MediaPaths?: string[];
  MediaUrls?: string[];
  MediaTypes?: string[];
+  MediaWorkspaceDir?: string;
  /** Attachment indexes whose audio was already transcribed before media understanding runs. */
  MediaTranscribedIndexes?: number[];
+  /**
+   * Marker: skip downstream stageSandboxMedia. chat.send RPC sets this so
+   * staging runs synchronously before respond() and surfaces 5xx to the
+   * client; any later failure only reaches the broadcast channel.
+   */
+  MediaStaged?: boolean;
  /** Telegram sticker metadata (emoji, set name, file IDs, cached description). */
  Sticker?: StickerContextMetadata;
  /** True when current-turn sticker media is present in MediaPaths (false for cached-description path). */
--- a/src/gateway/chat-attachments.test.ts
+++ b/src/gateway/chat-attachments.test.ts
@@ -1,26 +1,66 @@
-import { describe, expect, it, vi } from "vitest";
-import { deleteMediaBuffer } from "../media/store.js";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+
+const saveMediaBufferMock = vi.hoisted(() =>
+  vi.fn(async (_buffer: Buffer, mime?: string, _subdir?: string) => ({
+    id: `fake-id-${Math.random().toString(36).slice(2, 10)}`,
+    path: `/tmp/openclaw-test-media/inbound/fake.${mime?.split("/")[1] ?? "bin"}`,
+    size: 0,
+    contentType: mime,
+  })),
+);
+const deleteMediaBufferMock = vi.hoisted(() =>
+  vi.fn(async (_id: string, _subdir?: string) => undefined),
+);
+
+vi.mock("../media/store.js", async (importOriginal) => {
+  const actual = (await importOriginal()) as Record<string, unknown>;
+  return {
+    ...actual,
+    saveMediaBuffer: saveMediaBufferMock,
+    deleteMediaBuffer: deleteMediaBufferMock,
+  };
+});
+
+import type { OpenClawConfig } from "../config/types.openclaw.js";
+import { MAX_IMAGE_BYTES } from "../media/constants.js";
 import {
  buildMessageWithAttachments,
  type ChatAttachment,
+  DEFAULT_CHAT_ATTACHMENT_MAX_MB,
  parseMessageWithAttachments,
+  resolveChatAttachmentMaxBytes,
+  UnsupportedAttachmentError,
 } from "./chat-attachments.js";

 const PNG_1x1 =
  "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";

-async function parseWithWarnings(message: string, attachments: ChatAttachment[]) {
+async function parseWithWarnings(
+  message: string,
+  attachments: ChatAttachment[],
+  opts: Parameters<typeof parseMessageWithAttachments>[2] = {},
+) {
  const logs: string[] = [];
  const parsed = await parseMessageWithAttachments(message, attachments, {
    log: { warn: (warning) => logs.push(warning) },
+    ...opts,
  });
  return { parsed, logs };
 }

 async function cleanupOffloadedRefs(refs: { id: string }[]) {
-  await Promise.allSettled(refs.map((ref) => deleteMediaBuffer(ref.id, "inbound")));
+  await Promise.allSettled(refs.map((ref) => deleteMediaBufferMock(ref.id, "inbound")));
 }

+beforeEach(() => {
+  saveMediaBufferMock.mockClear();
+  deleteMediaBufferMock.mockClear();
+});
+
+afterEach(() => {
+  vi.restoreAllMocks();
+});
+
 describe("buildMessageWithAttachments", () => {
  it("embeds a single image as data URL", () => {
    const msg = buildMessageWithAttachments("see this", [
@@ -81,19 +121,43 @@ describe("parseMessageWithAttachments", () => {
    expect(logs).toHaveLength(0);
  });

-  it("drops non-image payloads and logs", async () => {
-    const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
-    const { parsed, logs } = await parseWithWarnings("x", [
+  it("accepts non-image payloads and offloads them via the media store", async () => {
+    const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
+    const { parsed, logs } = await parseWithWarnings("read this", [
      {
        type: "file",
-        mimeType: "image/png",
-        fileName: "not-image.pdf",
+        mimeType: "application/pdf",
+        fileName: "report.pdf",
        content: pdf,
      },
    ]);
    expect(parsed.images).toHaveLength(0);
-    expect(logs).toHaveLength(1);
-    expect(logs[0]).toMatch(/non-image/i);
+    expect(parsed.offloadedRefs).toHaveLength(1);
+    const ref = parsed.offloadedRefs[0];
+    expect(ref.mimeType).toBe("application/pdf");
+    expect(ref.label).toBe("report.pdf");
+    expect(ref.mediaRef).toMatch(/^media:\/\/inbound\//);
+    // Non-image offloads MUST NOT inject a media://URI into the message —
+    // the caller is responsible for routing offloadedRefs[].path into
+    // ctx.MediaPaths so the workspace stage surfaces a real path.
+    expect(parsed.message).toBe("read this");
+    expect(saveMediaBufferMock).toHaveBeenCalledOnce();
+    expect(saveMediaBufferMock.mock.calls[0]?.[1]).toBe("application/pdf");
+    expect(logs).toHaveLength(0);
+  });
+
+  it("offloads opaque binary when sniff and provided mime are both absent", async () => {
+    const unknown = Buffer.from("just some bytes that do not match any signature").toString(
+      "base64",
+    );
+    const { parsed, logs } = await parseWithWarnings("take a look", [
+      { type: "file", fileName: "blob.dat", content: unknown },
+    ]);
+    expect(parsed.offloadedRefs).toHaveLength(1);
+    expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/octet-stream");
+    expect(saveMediaBufferMock.mock.calls[0]?.[1]).toBe("application/octet-stream");
+    expect(parsed.message).toBe("take a look");
+    expect(logs).toHaveLength(0);
  });

  it("prefers sniffed mime type and logs mismatch", async () => {
@@ -111,28 +175,9 @@ describe("parseMessageWithAttachments", () => {
    expect(logs[0]).toMatch(/mime mismatch/i);
  });

-  it("persists unknown non-image files when sniff fails", async () => {
-    const unknown = Buffer.from("not an image").toString("base64");
-    const { parsed, logs } = await parseWithWarnings("x", [
-      { type: "file", fileName: "unknown.bin", content: unknown },
-    ]);
-    try {
-      expect(parsed.images).toHaveLength(0);
-      expect(parsed.offloadedRefs).toHaveLength(1);
-      expect(parsed.offloadedRefs[0]).toMatchObject({
-        label: "unknown.bin",
-        mimeType: "application/octet-stream",
-      });
-      expect(parsed.message).toMatch(/^x\n\[media attached: media:\/\/inbound\//);
-      expect(logs).toHaveLength(0);
-    } finally {
-      await cleanupOffloadedRefs(parsed.offloadedRefs);
-    }
-  });
-
-  it("keeps valid images and drops invalid ones", async () => {
+  it("keeps image inline and offloads non-image side by side", async () => {
    const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
-    const { parsed, logs } = await parseWithWarnings("x", [
+    const { parsed } = await parseWithWarnings("x", [
      {
        type: "image",
        mimeType: "image/png",
@@ -141,15 +186,196 @@ describe("parseMessageWithAttachments", () => {
      },
      {
        type: "file",
-        mimeType: "image/png",
-        fileName: "not-image.pdf",
+        mimeType: "application/pdf",
+        fileName: "report.pdf",
        content: pdf,
      },
    ]);
    expect(parsed.images).toHaveLength(1);
    expect(parsed.images[0]?.mimeType).toBe("image/png");
-    expect(parsed.images[0]?.data).toBe(PNG_1x1);
-    expect(logs.some((l) => /non-image/i.test(l))).toBe(true);
+    expect(parsed.offloadedRefs).toHaveLength(1);
+    expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf");
+    expect(parsed.imageOrder).toEqual(["inline"]);
+  });
+
+  it("excludes non-image offloads from imageOrder in mixed batches", async () => {
+    // Regression: a prior revision pushed "offloaded" for every offload,
+    // including non-image files. In a [non-image, inline, offloaded-image]
+    // batch that produced imageOrder=["offloaded","inline","offloaded"] even
+    // though only one `[media attached: media://...]` line is ever appended
+    // to the prompt (for the image offload). extractTrailingAttachmentMediaUris
+    // then read count=2 against one trailing URI, and
+    // mergePromptAttachmentImages placed the single offloaded image into the
+    // first "offloaded" slot — swapping it ahead of the inline image.
+    const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
+    const bigPng = Buffer.alloc(2_100_000);
+    bigPng.set([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], 0);
+    const { parsed } = await parseWithWarnings("x", [
+      { type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf },
+      { type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 },
+      {
+        type: "image",
+        mimeType: "image/png",
+        fileName: "big.png",
+        content: bigPng.toString("base64"),
+      },
+    ]);
+    expect(parsed.images).toHaveLength(1);
+    expect(parsed.offloadedRefs.map((ref) => ref.mimeType)).toEqual([
+      "application/pdf",
+      "image/png",
+    ]);
+    expect(parsed.imageOrder).toEqual(["inline", "offloaded"]);
+    // The offloaded-image URI is the sole trailing media:// line, matching
+    // imageOrder's single "offloaded" slot.
+    const trailingMediaLines = parsed.message
+      .split("\n")
+      .filter((line) => line.trim().startsWith("[media attached: media://inbound/"));
+    expect(trailingMediaLines).toHaveLength(1);
+  });
+
+  it("rejects oversized images before offload", async () => {
+    const big = Buffer.alloc(MAX_IMAGE_BYTES + 1, 1).toString("base64");
+
+    await expect(
+      parseMessageWithAttachments(
+        "x",
+        [{ type: "image", mimeType: "image/png", fileName: "huge.png", content: big }],
+        { maxBytes: resolveChatAttachmentMaxBytes({} as OpenClawConfig), log: { warn: () => {} } },
+      ),
+    ).rejects.toThrow(/image exceeds size limit/i);
+    expect(saveMediaBufferMock).not.toHaveBeenCalled();
+  });
+
+  it("preserves specific OOXML mime when sniff returns generic zip (docx)", async () => {
+    const docx = Buffer.from("PK\u0003\u0004fake-docx-content").toString("base64");
+    const { parsed } = await parseWithWarnings("x", [
+      {
+        type: "file",
+        mimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        fileName: "spec.docx",
+        content: docx,
+      },
+    ]);
+    expect(parsed.offloadedRefs).toHaveLength(1);
+    expect(parsed.offloadedRefs[0]?.label).toBe("spec.docx");
+    // Docx sniffs as application/zip; the provided OOXML mime must win so the
+    // agent sees the real document type, not a generic archive.
+    expect(parsed.offloadedRefs[0]?.mimeType).toBe(
+      "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    );
+  });
+
+  it("recovers specific mime from filename extension when sniff is generic and provided mime is absent", async () => {
+    const xlsx = Buffer.from("PK\u0003\u0004fake-xlsx").toString("base64");
+    const { parsed } = await parseWithWarnings("x", [
+      { type: "file", fileName: "sheet.xlsx", content: xlsx },
+    ]);
+    expect(parsed.offloadedRefs).toHaveLength(1);
+    expect(parsed.offloadedRefs[0]?.mimeType).toBe(
+      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    );
+  });
+
+  it("accepts zip attachments via workspace offload", async () => {
+    const zip = Buffer.from("PK\u0003\u0004zip-archive-bytes").toString("base64");
+    const { parsed } = await parseWithWarnings("x", [
+      {
+        type: "file",
+        mimeType: "application/zip",
+        fileName: "bundle.zip",
+        content: zip,
+      },
+    ]);
+    expect(parsed.offloadedRefs).toHaveLength(1);
+    expect(parsed.offloadedRefs[0]?.label).toBe("bundle.zip");
+    expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/zip");
+  });
+});
+
+describe("parseMessageWithAttachments validation errors", () => {
+  it("throws UnsupportedAttachmentError on empty payload", async () => {
+    await expect(
+      parseMessageWithAttachments(
+        "x",
+        [{ type: "file", mimeType: "application/pdf", fileName: "empty.pdf", content: "" }],
+        { log: { warn: () => {} } },
+      ),
+    ).rejects.toMatchObject({
+      name: "UnsupportedAttachmentError",
+      reason: "empty-payload",
+    });
+    expect(saveMediaBufferMock).not.toHaveBeenCalled();
+  });
+
+  it("throws UnsupportedAttachmentError on non-image when acceptNonImage is false", async () => {
+    const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
+    let caught: unknown;
+    try {
+      await parseMessageWithAttachments(
+        "x",
+        [{ type: "file", mimeType: "application/pdf", fileName: "a.pdf", content: pdf }],
+        { log: { warn: () => {} }, acceptNonImage: false },
+      );
+    } catch (err) {
+      caught = err;
+    }
+    expect(caught).toBeInstanceOf(UnsupportedAttachmentError);
+    expect((caught as UnsupportedAttachmentError).reason).toBe("unsupported-non-image");
+    expect(saveMediaBufferMock).not.toHaveBeenCalled();
+  });
+
+  it("rejects generic-container payloads mislabeled as images when acceptNonImage is false", async () => {
+    const docx = Buffer.from("PK\u0003\u0004fake-docx-content").toString("base64");
+    let caught: unknown;
+    try {
+      await parseMessageWithAttachments(
+        "x",
+        [{ type: "file", mimeType: "image/png", fileName: "report.docx", content: docx }],
+        { log: { warn: () => {} }, acceptNonImage: false },
+      );
+    } catch (err) {
+      caught = err;
+    }
+    expect(caught).toBeInstanceOf(UnsupportedAttachmentError);
+    expect((caught as UnsupportedAttachmentError).reason).toBe("unsupported-non-image");
+    expect(saveMediaBufferMock).not.toHaveBeenCalled();
+  });
+
+  it("throws UnsupportedAttachmentError on image when supportsInlineImages is false", async () => {
+    let caught: unknown;
+    try {
+      await parseMessageWithAttachments(
+        "x",
+        [{ type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 }],
+        { log: { warn: () => {} }, supportsInlineImages: false },
+      );
+    } catch (err) {
+      caught = err;
+    }
+    expect(caught).toBeInstanceOf(UnsupportedAttachmentError);
+    expect((caught as UnsupportedAttachmentError).reason).toBe("text-only-image");
+    expect(saveMediaBufferMock).not.toHaveBeenCalled();
+  });
+
+  it("still offloads non-image attachments when supportsInlineImages is false", async () => {
+    const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
+    const { parsed } = await parseWithWarnings(
+      "x",
+      [{ type: "file", mimeType: "application/pdf", fileName: "a.pdf", content: pdf }],
+      { supportsInlineImages: false },
+    );
+    expect(parsed.offloadedRefs).toHaveLength(1);
+    expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf");
+    expect(saveMediaBufferMock).toHaveBeenCalledOnce();
+  });
+
+  it("passes through unchanged on text-only session with no attachments", async () => {
+    const { parsed } = await parseWithWarnings("hello", [], { supportsInlineImages: false });
+    expect(parsed.message).toBe("hello");
+    expect(parsed.images).toHaveLength(0);
+    expect(parsed.offloadedRefs).toHaveLength(0);
+    expect(saveMediaBufferMock).not.toHaveBeenCalled();
  });

  it("persists non-image file attachments as media refs", async () => {
@@ -168,13 +394,13 @@ describe("parseMessageWithAttachments", () => {

    try {
      expect(parsed.images).toHaveLength(0);
-      expect(parsed.imageOrder).toEqual(["offloaded"]);
+      expect(parsed.imageOrder).toEqual([]);
      expect(parsed.offloadedRefs).toHaveLength(1);
      expect(parsed.offloadedRefs[0]).toMatchObject({
        mimeType: "application/pdf",
        label: "brief.pdf",
      });
-      expect(parsed.message).toMatch(/^read this\n\[media attached: media:\/\/inbound\//);
+      expect(parsed.message).toBe("read this");
    } finally {
      await cleanupOffloadedRefs(parsed.offloadedRefs);
    }
@@ -258,6 +484,32 @@ describe("parseMessageWithAttachments", () => {
  });
 });

+describe("resolveChatAttachmentMaxBytes", () => {
+  const MB = 1024 * 1024;
+  const DEFAULT_BYTES = DEFAULT_CHAT_ATTACHMENT_MAX_MB * MB;
+
+  const cfgWithMediaMaxMb = (value: unknown): OpenClawConfig =>
+    ({ agents: { defaults: { mediaMaxMb: value } } }) as unknown as OpenClawConfig;
+
+  it("honours a configured agents.defaults.mediaMaxMb", () => {
+    expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(10))).toBe(10 * MB);
+    expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(50))).toBe(50 * MB);
+  });
+
+  it("falls back to DEFAULT_CHAT_ATTACHMENT_MAX_MB when unset", () => {
+    expect(resolveChatAttachmentMaxBytes({} as OpenClawConfig)).toBe(DEFAULT_BYTES);
+    expect(resolveChatAttachmentMaxBytes({ agents: {} } as unknown as OpenClawConfig)).toBe(
+      DEFAULT_BYTES,
+    );
+  });
+
+  it("rejects non-positive, non-finite, or non-number values", () => {
+    for (const bad of [0, -5, Number.NaN, Number.POSITIVE_INFINITY, "50", null, undefined]) {
+      expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(bad))).toBe(DEFAULT_BYTES);
+    }
+  });
+});
+
 describe("shared attachment validation", () => {
  it("rejects invalid base64 content for both builder and parser", async () => {
    const bad: ChatAttachment = {
--- a/src/gateway/chat-attachments.ts
+++ b/src/gateway/chat-attachments.ts
@@ -1,12 +1,12 @@
+import type { OpenClawConfig } from "../config/types.openclaw.js";
 import { formatErrorMessage } from "../infra/errors.js";
 import { estimateBase64DecodedBytes } from "../media/base64.js";
+import { MAX_IMAGE_BYTES } from "../media/constants.js";
+import { extensionForMime, mimeTypeFromFilePath } from "../media/mime.js";
 import type { PromptImageOrderEntry } from "../media/prompt-image-order.js";
 import { sniffMimeFromBase64 } from "../media/sniff-mime-from-base64.js";
 import { deleteMediaBuffer, saveMediaBuffer } from "../media/store.js";
-import {
-  normalizeLowercaseStringOrEmpty,
-  normalizeOptionalLowercaseString,
-} from "../shared/string-coerce.js";
+import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";

 export type ChatAttachment = {
  type?: string;
@@ -21,47 +21,19 @@ export type ChatImageContent = {
  mimeType: string;
 };

-/**
- * Metadata for an attachment that was offloaded to the media store.
- *
- * Included in ParsedMessageWithImages.offloadedRefs so that callers can
- * persist structured media metadata for transcripts. Without this, consumers
- * that derive MediaPath/MediaPaths from the `images` array (e.g.
- * persistChatSendImages and buildChatSendTranscriptMessage in chat.ts) would
- * silently omit all large attachments that were offloaded to disk.
- */
 export type OffloadedRef = {
-  /** Opaque media URI injected into the message, e.g. "media://inbound/<id>" */
  mediaRef: string;
-  /** The raw media ID from SavedMedia.id, usable with resolveMediaBufferPath */
  id: string;
-  /** Absolute filesystem path returned by saveMediaBuffer — used for transcript MediaPath */
  path: string;
-  /** MIME type of the offloaded attachment */
  mimeType: string;
-  /** The label / filename of the original attachment */
  label: string;
+  sizeBytes: number;
 };

 export type ParsedMessageWithImages = {
  message: string;
-  /** Small attachments (≤ OFFLOAD_THRESHOLD_BYTES) passed inline to the model */
  images: ChatImageContent[];
-  /** Original accepted attachment order after inline/offloaded split. */
  imageOrder: PromptImageOrderEntry[];
-  /**
-   * Large attachments (> OFFLOAD_THRESHOLD_BYTES) that were offloaded to the
-   * media store. Each entry corresponds to a `[media attached: media://inbound/<id>]`
-   * marker appended to `message`.
-   *
-   * Callers MUST persist this list separately for transcript media metadata.
-   * It is intentionally separate from `images` because downstream model calls
-   * do not receive these as inline image blocks.
-   *
-   * ⚠️  Call sites (chat.ts, agent.ts, server-node-events.ts) MUST also pass
-   * `supportsImages: modelSupportsImages(model)` so text-only model runs
-   * offload images as media refs instead of passing inline image blocks.
-   */
  offloadedRefs: OffloadedRef[];
 };

@@ -78,49 +50,38 @@ type NormalizedAttachment = {

 type SavedMedia = {
  id: string;
-  path?: string;
+  path: string;
 };

 const OFFLOAD_THRESHOLD_BYTES = 2_000_000;
 const TEXT_ONLY_OFFLOAD_LIMIT = 10;

-const MIME_TO_EXT: Record<string, string> = {
-  "image/jpeg": ".jpg",
-  "image/jpg": ".jpg",
-  "image/png": ".png",
-  "image/webp": ".webp",
-  "image/gif": ".gif",
-  "image/heic": ".heic",
-  "image/heif": ".heif",
-  // bmp/tiff excluded from SUPPORTED_OFFLOAD_MIMES to avoid extension-loss
-  // bug in store.ts; entries kept here for future extension support
-  "image/bmp": ".bmp",
-  "image/tiff": ".tiff",
-};
+export const DEFAULT_CHAT_ATTACHMENT_MAX_MB = 20;

-// Module-level Set for O(1) lookup — not rebuilt on every attachment iteration.
-//
-// heic/heif are included only if store.ts's extensionForMime maps them to an
-// extension. If it does not (same extension-loss risk as bmp/tiff), remove
-// them from this set.
-const SUPPORTED_OFFLOAD_MIMES = new Set([
-  "image/jpeg",
-  "image/jpg",
-  "image/png",
-  "image/webp",
-  "image/gif",
-  "image/heic",
-  "image/heif",
-]);
+export function resolveChatAttachmentMaxBytes(cfg: OpenClawConfig): number {
+  const configured = cfg.agents?.defaults?.mediaMaxMb;
+  const mb =
+    typeof configured === "number" && Number.isFinite(configured) && configured > 0
+      ? configured
+      : DEFAULT_CHAT_ATTACHMENT_MAX_MB;
+  return Math.floor(mb * 1024 * 1024);
+}
+
+export type UnsupportedAttachmentReason =
+  | "empty-payload"
+  | "text-only-image"
+  | "unsupported-non-image"
+  | "non-image-too-large-for-sandbox";
+
+export class UnsupportedAttachmentError extends Error {
+  readonly reason: UnsupportedAttachmentReason;
+  constructor(reason: UnsupportedAttachmentReason, message: string) {
+    super(message);
+    this.name = "UnsupportedAttachmentError";
+    this.reason = reason;
+  }
+}

-/**
- * Raised when the Gateway cannot persist an attachment to the media store.
- *
- * Distinct from ordinary input-validation errors so that Gateway handlers can
- * map it to a server-side 5xx status rather than a client 4xx.
- *
- * Example causes: ENOSPC, EPERM, unexpected saveMediaBuffer return shape.
- */
 export class MediaOffloadError extends Error {
  readonly cause: unknown;
  constructor(message: string, options?: ErrorOptions) {
@@ -142,42 +103,24 @@ function isImageMime(mime?: string): boolean {
  return typeof mime === "string" && mime.startsWith("image/");
 }

-function isVideoMime(mime?: string): boolean {
-  return typeof mime === "string" && mime.startsWith("video/");
+function isGenericContainerMime(mime?: string): boolean {
+  return mime === "application/zip" || mime === "application/octet-stream";
 }

-function isGenericMime(mime?: string): boolean {
-  return (
-    !mime ||
-    mime === "application/octet-stream" ||
-    mime === "binary/octet-stream" ||
-    mime === "application/unknown"
-  );
+function shouldIgnoreProvidedImageMime(params: {
+  sniffedMime?: string;
+  providedMime?: string;
+}): boolean {
+  return isGenericContainerMime(params.sniffedMime) && isImageMime(params.providedMime);
 }

 function isValidBase64(value: string): boolean {
  if (value.length === 0 || value.length % 4 !== 0) {
    return false;
  }
-  // A full O(n) regex scan is safe: no overlapping quantifiers, fails linearly.
-  // Prevents adversarial payloads padded with megabytes of whitespace from
-  // bypassing length thresholds.
  return /^[A-Za-z0-9+/]+={0,2}$/.test(value);
 }

-/**
- * Confirms that the decoded buffer produced by Buffer.from(b64, 'base64')
- * matches the pre-decode size estimate.
- *
- * Node's Buffer.from silently drops invalid base64 characters rather than
- * throwing. A material size discrepancy means the source string contained
- * embedded garbage that was silently stripped, which would produce a corrupted
- * file on disk. ±3 bytes of leeway accounts for base64 padding rounding.
- *
- * IMPORTANT: this is an input-validation check (4xx client error).
- * It MUST be called OUTSIDE the MediaOffloadError try/catch so that
- * corrupt-input errors are not misclassified as 5xx server errors.
- */
 function verifyDecodedSize(buffer: Buffer, estimatedBytes: number, label: string): void {
  if (Math.abs(buffer.byteLength - estimatedBytes) > 3) {
    throw new Error(
@@ -191,40 +134,34 @@ function ensureExtension(label: string, mime: string): string {
  if (/\.[a-zA-Z0-9]+$/.test(label)) {
    return label;
  }
-  const ext = MIME_TO_EXT[normalizeLowercaseStringOrEmpty(mime)] ?? "";
+  const ext = extensionForMime(mime) ?? "";
  return ext ? `${label}${ext}` : label;
 }

-/**
- * Type guard for the return value of saveMediaBuffer.
- *
- * Also validates that the returned ID:
- * - is a non-empty string
- * - contains no path separators (/ or \) or null bytes
- *
- * Catching a bad shape here produces a cleaner error than a cryptic failure
- * deeper in the stack, and is treated as a 5xx infrastructure error.
- */
 function assertSavedMedia(value: unknown, label: string): SavedMedia {
  if (
-    value !== null &&
-    typeof value === "object" &&
-    "id" in value &&
-    typeof (value as Record<string, unknown>).id === "string"
+    value === null ||
+    typeof value !== "object" ||
+    !("id" in value) ||
+    typeof (value as Record<string, unknown>).id !== "string"
  ) {
-    const id = (value as Record<string, unknown>).id as string;
-    if (id.length === 0) {
-      throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`);
-    }
-    if (id.includes("/") || id.includes("\\") || id.includes("\0")) {
-      throw new Error(
-        `attachment ${label}: saveMediaBuffer returned an unsafe media ID ` +
-          `(contains path separator or null byte)`,
-      );
-    }
-    return value as SavedMedia;
+    throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`);
  }
-  throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`);
+  const id = (value as Record<string, unknown>).id as string;
+  if (id.length === 0) {
+    throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`);
+  }
+  if (id.includes("/") || id.includes("\\") || id.includes("\0")) {
+    throw new Error(
+      `attachment ${label}: saveMediaBuffer returned an unsafe media ID ` +
+        `(contains path separator or null byte)`,
+    );
+  }
+  const path = (value as Record<string, unknown>).path;
+  if (typeof path !== "string" || path.length === 0) {
+    throw new Error(`attachment ${label}: saveMediaBuffer returned no on-disk path`);
+  }
+  return { id, path };
 }

 function normalizeAttachment(
@@ -269,47 +206,22 @@ function validateAttachmentBase64OrThrow(
  return sizeBytes;
 }

-/**
- * Parse attachments and extract images as structured content blocks.
- * Returns the message text, inline image blocks, and offloaded media refs.
- *
- * ## Offload behaviour
- * Attachments whose decoded size exceeds OFFLOAD_THRESHOLD_BYTES are saved to
- * disk via saveMediaBuffer and replaced with an opaque `media://inbound/<id>`
- * URI appended to the message. The agent resolves these URIs via
- * resolveMediaBufferPath before passing them to the model.
- *
- * ## Transcript metadata
- * Callers MUST use `result.offloadedRefs` to persist structured media metadata
- * for transcripts. These refs are intentionally excluded from `result.images`
- * because they are not passed inline to the model.
- *
- * ## Text-only model runs
- * Pass `supportsImages: false` for text-only model runs so images are offloaded
- * as `media://inbound/<id>` refs instead of being sent as inline image blocks.
- * The agent runner can then resolve the refs through the normal media path.
- *
- * ## Cleanup on failure
- * On any parse failure after files have already been offloaded, best-effort
- * cleanup is performed before rethrowing so that malformed requests do not
- * accumulate orphaned files on disk ahead of the periodic TTL sweep.
- *
- * ## Known ordering limitation
- * In mixed large/small batches, the model receives images in a different order
- * than the original attachment list because detectAndLoadPromptImages
- * initialises from existingImages first, then appends prompt-detected refs.
- * A future refactor should unify all image references into a single ordered list.
- *
- * @throws {MediaOffloadError} Infrastructure failure saving to media store → 5xx.
- * @throws {Error} Input validation failure → 4xx.
- */
 export async function parseMessageWithAttachments(
  message: string,
  attachments: ChatAttachment[] | undefined,
-  opts?: { maxBytes?: number; log?: AttachmentLog; supportsImages?: boolean },
+  opts?: {
+    maxBytes?: number;
+    log?: AttachmentLog;
+    supportsImages?: boolean;
+    supportsInlineImages?: boolean;
+    acceptNonImage?: boolean;
+  },
 ): Promise<ParsedMessageWithImages> {
-  const maxBytes = opts?.maxBytes ?? 5_000_000;
+  const maxBytes = opts?.maxBytes ?? DEFAULT_CHAT_ATTACHMENT_MAX_MB * 1024 * 1024;
  const log = opts?.log;
+  const shouldForceImageOffload = opts?.supportsImages === false;
+  const supportsInlineImages = opts?.supportsInlineImages !== false;
+  const acceptNonImage = opts?.acceptNonImage !== false;

  if (!attachments || attachments.length === 0) {
    return { message, images: [], imageOrder: [], offloadedRefs: [] };
@@ -319,11 +231,7 @@ export async function parseMessageWithAttachments(
  const imageOrder: PromptImageOrderEntry[] = [];
  const offloadedRefs: OffloadedRef[] = [];
  let updatedMessage = message;
-  const shouldForceOffload = opts?.supportsImages === false;
  let textOnlyImageOffloadCount = 0;
-
-  // Track IDs of files saved during this request for cleanup if a later
-  // attachment fails validation and the entire parse is aborted.
  const savedMediaIds: string[] = [];

  try {
@@ -339,16 +247,14 @@ export async function parseMessageWithAttachments(

      const { base64: b64, label, mime } = normalized;

+      if (b64.length === 0) {
+        throw new UnsupportedAttachmentError("empty-payload", `attachment ${label}: empty payload`);
+      }
      if (!isValidBase64(b64)) {
        throw new Error(`attachment ${label}: invalid base64 content`);
      }

      const sizeBytes = estimateBase64DecodedBytes(b64);
-      if (sizeBytes <= 0) {
-        log?.warn(`attachment ${label}: estimated size is zero, dropping`);
-        continue;
-      }
-
      if (sizeBytes > maxBytes) {
        throw new Error(
          `attachment ${label}: exceeds size limit (${sizeBytes} > ${maxBytes} bytes)`,
@@ -357,67 +263,74 @@ export async function parseMessageWithAttachments(

      const providedMime = normalizeMime(mime);
      const sniffedMime = normalizeMime(await sniffMimeFromBase64(b64));
+      const labelMime = normalizeMime(mimeTypeFromFilePath(label));
+      const trustedProvidedMime = shouldIgnoreProvidedImageMime({ sniffedMime, providedMime })
+        ? undefined
+        : providedMime;

-      if (sniffedMime && !isImageMime(sniffedMime) && isImageMime(providedMime)) {
-        log?.warn(`attachment ${label}: detected non-image (${sniffedMime}), dropping`);
-        continue;
-      }
+      // Prefer specific MIME signals over generic container types. OOXML
+      // documents (docx/xlsx/pptx) sniff as application/zip; without this
+      // priority the agent would receive a `.zip` instead of the specific
+      // Office document the caller declared.
+      const finalMime =
+        (sniffedMime && !isGenericContainerMime(sniffedMime) && sniffedMime) ||
+        (trustedProvidedMime &&
+          !isGenericContainerMime(trustedProvidedMime) &&
+          trustedProvidedMime) ||
+        (labelMime && !isGenericContainerMime(labelMime) && labelMime) ||
+        sniffedMime ||
+        trustedProvidedMime ||
+        labelMime ||
+        "application/octet-stream";

-      const shouldHandleAsImage =
-        isImageMime(sniffedMime) || (isImageMime(providedMime) && !sniffedMime);
-      if (!shouldHandleAsImage) {
-        const finalMime = sniffedMime ?? providedMime ?? "application/octet-stream";
-        if (isVideoMime(finalMime)) {
-          log?.warn(`attachment ${label}: video attachments are not supported, dropping`);
-          continue;
-        }
-
-        const buffer = Buffer.from(b64, "base64");
-        verifyDecodedSize(buffer, sizeBytes, label);
-
-        try {
-          const rawResult = await saveMediaBuffer(buffer, finalMime, "inbound", maxBytes, label);
-          const savedMedia = assertSavedMedia(rawResult, label);
-          savedMediaIds.push(savedMedia.id);
-
-          const mediaRef = `media://inbound/${savedMedia.id}`;
-          updatedMessage += `\n[media attached: ${mediaRef}]`;
-          log?.info?.(`[Gateway] Saved file attachment. Saved: ${mediaRef}`);
-          offloadedRefs.push({
-            mediaRef,
-            id: savedMedia.id,
-            path: savedMedia.path ?? "",
-            mimeType: finalMime,
-            label,
-          });
-          imageOrder.push("offloaded");
-        } catch (err) {
-          const errorMessage = formatErrorMessage(err);
-          throw new MediaOffloadError(
-            `[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`,
-            { cause: err },
-          );
-        }
-        continue;
-      }
      if (
        sniffedMime &&
        providedMime &&
-        !isGenericMime(providedMime) &&
+        !isGenericContainerMime(providedMime) &&
        sniffedMime !== providedMime
      ) {
+        const usedSource =
+          finalMime === sniffedMime
+            ? "sniffed"
+            : finalMime === providedMime
+              ? "provided"
+              : "label-derived";
        log?.warn(
-          `attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using sniffed`,
+          `attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using ${usedSource}`,
        );
      }

-      // Third fallback normalises `mime` so a raw un-normalised string (e.g.
-      // "IMAGE/JPEG") does not silently bypass the SUPPORTED_OFFLOAD_MIMES check.
-      const finalMime = sniffedMime ?? providedMime ?? normalizeMime(mime) ?? mime;
+      const isImage = isImageMime(finalMime);
+      if (isImage && !supportsInlineImages && !shouldForceImageOffload) {
+        throw new UnsupportedAttachmentError(
+          "text-only-image",
+          `attachment ${label}: active model does not accept image inputs`,
+        );
+      }
+      if (!isImage && !acceptNonImage) {
+        throw new UnsupportedAttachmentError(
+          "unsupported-non-image",
+          `attachment ${label}: non-image attachments (${finalMime}) are not supported on this entrypoint`,
+        );
+      }
+      // Agent-side hydration (loadImageFromRef via optimizeAndClampImage / GIF
+      // direct compare) caps at MAX_IMAGE_BYTES. Accepting images above that
+      // would offload a file the runner later drops to null — a successful
+      // response with a silently missing image. Reject here so the client
+      // sees an explicit 4xx. Non-image attachments keep the full maxBytes
+      // ceiling because their host path (ctx.MediaPaths → Read/Bash) doesn't
+      // load into the model.
+      if (isImage && sizeBytes > MAX_IMAGE_BYTES) {
+        throw new Error(
+          `attachment ${label}: image exceeds size limit (${sizeBytes} > ${MAX_IMAGE_BYTES} bytes)`,
+        );
+      }

-      let isOffloaded = false;
-
-      if (shouldForceOffload && textOnlyImageOffloadCount >= TEXT_ONLY_OFFLOAD_LIMIT) {
+      if (
+        shouldForceImageOffload &&
+        isImage &&
+        textOnlyImageOffloadCount >= TEXT_ONLY_OFFLOAD_LIMIT
+      ) {
        log?.warn(
          `attachment ${label}: dropping image because text-only offload limit ` +
            `${TEXT_ONLY_OFFLOAD_LIMIT} was reached`,
@@ -426,93 +339,64 @@ export async function parseMessageWithAttachments(
        continue;
      }

-      if (shouldForceOffload || sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
-        const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime);
+      const shouldOffload =
+        shouldForceImageOffload || !isImage || sizeBytes > OFFLOAD_THRESHOLD_BYTES;

-        if (!isSupportedForOffload) {
-          if (shouldForceOffload) {
-            log?.warn(
-              `attachment ${label}: format ${finalMime} cannot be offloaded for ` +
-                "text-only model, dropping",
-            );
-            continue;
-          }
-          // Passing this inline would reintroduce the OOM risk this PR prevents.
-          throw new Error(
-            `attachment ${label}: format ${finalMime} is too large to pass inline ` +
-              `(${sizeBytes} > ${OFFLOAD_THRESHOLD_BYTES} bytes) and cannot be offloaded. ` +
-              `Please convert to JPEG, PNG, WEBP, GIF, HEIC, or HEIF.`,
-          );
-        }
-
-        // Decode and run input-validation BEFORE the MediaOffloadError try/catch.
-        // verifyDecodedSize is a 4xx client error and must not be wrapped as a
-        // 5xx MediaOffloadError.
-        const buffer = Buffer.from(b64, "base64");
-        verifyDecodedSize(buffer, sizeBytes, label);
-
-        // Only the storage operation is wrapped so callers can distinguish
-        // infrastructure failures (5xx) from input errors (4xx).
-        try {
-          const labelWithExt = ensureExtension(label, finalMime);
-
-          const rawResult = await saveMediaBuffer(
-            buffer,
-            finalMime,
-            "inbound",
-            maxBytes,
-            labelWithExt,
-          );
-
-          const savedMedia = assertSavedMedia(rawResult, label);
-
-          // Track for cleanup if a subsequent attachment fails.
-          savedMediaIds.push(savedMedia.id);
-
-          // Opaque URI — compatible with workspaceOnly sandboxes and decouples
-          // the Gateway from the agent's filesystem layout.
-          const mediaRef = `media://inbound/${savedMedia.id}`;
-
-          updatedMessage += `\n[media attached: ${mediaRef}]`;
-          log?.info?.(
-            shouldForceOffload
-              ? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
-              : `[Gateway] Intercepted large image payload. Saved: ${mediaRef}`,
-          );
-
-          // Record for transcript metadata — separate from `images` because
-          // these are not passed inline to the model.
-          offloadedRefs.push({
-            mediaRef,
-            id: savedMedia.id,
-            path: savedMedia.path ?? "",
-            mimeType: finalMime,
-            label,
-          });
-          imageOrder.push("offloaded");
-          if (shouldForceOffload) {
-            textOnlyImageOffloadCount++;
-          }
-
-          isOffloaded = true;
-        } catch (err) {
-          const errorMessage = formatErrorMessage(err);
-          throw new MediaOffloadError(
-            `[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`,
-            { cause: err },
-          );
-        }
-      }
-
-      if (isOffloaded) {
+      if (!shouldOffload) {
+        images.push({ type: "image", data: b64, mimeType: finalMime });
+        imageOrder.push("inline");
        continue;
      }

-      images.push({ type: "image", data: b64, mimeType: finalMime });
-      imageOrder.push("inline");
+      const buffer = Buffer.from(b64, "base64");
+      verifyDecodedSize(buffer, sizeBytes, label);
+
+      let savedMedia: SavedMedia;
+      try {
+        const labelWithExt = ensureExtension(label, finalMime);
+        const rawResult = await saveMediaBuffer(
+          buffer,
+          finalMime,
+          "inbound",
+          maxBytes,
+          labelWithExt,
+        );
+        savedMedia = assertSavedMedia(rawResult, label);
+      } catch (err) {
+        throw new MediaOffloadError(
+          `[Gateway Error] Failed to save intercepted media to disk: ${formatErrorMessage(err)}`,
+          { cause: err },
+        );
+      }
+
+      savedMediaIds.push(savedMedia.id);
+
+      const mediaRef = `media://inbound/${savedMedia.id}`;
+      if (isImage) {
+        updatedMessage += `\n[media attached: ${mediaRef}]`;
+      }
+      log?.info?.(
+        shouldForceImageOffload && isImage
+          ? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
+          : `[Gateway] Offloaded attachment (${finalMime}). Saved: ${mediaRef}`,
+      );
+
+      offloadedRefs.push({
+        mediaRef,
+        id: savedMedia.id,
+        path: savedMedia.path,
+        mimeType: finalMime,
+        label,
+        sizeBytes,
+      });
+      if (isImage) {
+        imageOrder.push("offloaded");
+        if (shouldForceImageOffload) {
+          textOnlyImageOffloadCount++;
+        }
+      }
    }
  } catch (err) {
-    // Best-effort cleanup before rethrowing.
    if (savedMediaIds.length > 0) {
      await Promise.allSettled(savedMediaIds.map((id) => deleteMediaBuffer(id, "inbound")));
    }
--- a/src/gateway/server-methods/agent.ts
+++ b/src/gateway/server-methods/agent.ts
@@ -79,7 +79,11 @@ import {
 } from "../../utils/message-channel.js";
 import { resolveAssistantIdentity } from "../assistant-identity.js";
 import { registerChatAbortController, resolveAgentRunExpiresAtMs } from "../chat-abort.js";
-import { MediaOffloadError, parseMessageWithAttachments } from "../chat-attachments.js";
+import {
+  MediaOffloadError,
+  parseMessageWithAttachments,
+  resolveChatAttachmentMaxBytes,
+} from "../chat-attachments.js";
 import { resolveAssistantAvatarUrl } from "../control-ui-shared.js";
 import { ADMIN_SCOPE } from "../method-scopes.js";
 import { GATEWAY_CLIENT_CAPS, hasGatewayClientCap } from "../protocol/client-info.js";
@@ -498,7 +502,7 @@ export const agentHandlers: GatewayRequestHandlers = {
      }
      const effectiveProvider = providerOverride || baseProvider;
      const effectiveModel = modelOverride || baseModel;
-      const supportsImages = await resolveGatewayModelSupportsImages({
+      const supportsInlineImages = await resolveGatewayModelSupportsImages({
        loadGatewayModelCatalog: context.loadGatewayModelCatalog,
        provider: effectiveProvider,
        model: effectiveModel,
@@ -506,9 +510,13 @@ export const agentHandlers: GatewayRequestHandlers = {

      try {
        const parsed = await parseMessageWithAttachments(message, normalizedAttachments, {
-          maxBytes: 5_000_000,
+          maxBytes: resolveChatAttachmentMaxBytes(cfg),
          log: context.logGateway,
-          supportsImages,
+          supportsInlineImages,
+          // agent.run does not yet wire a ctx.MediaPaths stage path, so reject
+          // non-image attachments explicitly (UnsupportedAttachmentError)
+          // instead of saving them where the agent cannot reach them.
+          acceptNonImage: false,
        });
        message = parsed.message.trim();
        images = parsed.images;
--- a/src/gateway/server-methods/chat.directive-tags.test.ts
+++ b/src/gateway/server-methods/chat.directive-tags.test.ts
@@ -66,6 +66,15 @@ const mockState = vi.hoisted(() => ({
  saveMediaWait: null as Promise<void> | null,
  activeSaveMediaCalls: 0,
  maxActiveSaveMediaCalls: 0,
+  sandboxWorkspace: null as { workspaceDir: string; containerWorkdir?: string } | null,
+  stageSandboxMediaError: null as Error | null,
+  stagedRelativePaths: null as string[] | null,
+  // `unstagedSources` lets tests simulate partial staging failure: absolute
+  // source paths listed here are excluded from the returned `staged` map even
+  // though ctx still carries their rewritten paths. This mirrors how the real
+  // stageSandboxMedia silently skips over-cap files.
+  unstagedSources: null as string[] | null,
+  deleteMediaBufferCalls: [] as Array<{ id: string; subdir?: string }>,
 }));

 const bindingMocks = vi.hoisted(() => ({
@@ -216,11 +225,54 @@ vi.mock("../../sessions/transcript-events.js", () => ({
  ),
 }));

+vi.mock("../../agents/sandbox/context.js", async () => {
+  const original = await vi.importActual<typeof import("../../agents/sandbox/context.js")>(
+    "../../agents/sandbox/context.js",
+  );
+  return {
+    ...original,
+    ensureSandboxWorkspaceForSession: vi.fn(async () => mockState.sandboxWorkspace),
+  };
+});
+
+vi.mock("../../auto-reply/reply/stage-sandbox-media.js", () => ({
+  stageSandboxMedia: vi.fn(
+    async (params: { ctx: { MediaPaths?: string[]; MediaPath?: string } }) => {
+      if (mockState.stageSandboxMediaError) {
+        throw mockState.stageSandboxMediaError;
+      }
+      const staged = new Map<string, string>();
+      const originalPaths = params.ctx.MediaPaths ?? [];
+      if (mockState.stagedRelativePaths) {
+        const mapping = mockState.stagedRelativePaths;
+        params.ctx.MediaPaths = [...mapping];
+        params.ctx.MediaPath = mapping[0];
+        for (let i = 0; i < mapping.length; i += 1) {
+          const source = originalPaths[i];
+          const dest = mapping[i];
+          if (source && dest) {
+            staged.set(source, dest);
+          }
+        }
+      }
+      if (mockState.unstagedSources) {
+        for (const source of mockState.unstagedSources) {
+          staged.delete(source);
+        }
+      }
+      return { staged };
+    },
+  ),
+}));
+
 vi.mock("../../media/store.js", async () => {
  const original =
    await vi.importActual<typeof import("../../media/store.js")>("../../media/store.js");
  return {
    ...original,
+    deleteMediaBuffer: vi.fn(async (id: string, subdir?: string) => {
+      mockState.deleteMediaBufferCalls.push({ id, subdir });
+    }),
    saveMediaBuffer: vi.fn(async (buffer: Buffer, contentType?: string, subdir?: string) => {
      mockState.activeSaveMediaCalls += 1;
      mockState.maxActiveSaveMediaCalls = Math.max(
@@ -465,6 +517,11 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
    mockState.maxActiveSaveMediaCalls = 0;
    bindingMocks.resolveByConversation.mockReset();
    bindingMocks.resolveByConversation.mockReturnValue(null);
+    mockState.sandboxWorkspace = null;
+    mockState.stageSandboxMediaError = null;
+    mockState.stagedRelativePaths = null;
+    mockState.unstagedSources = null;
+    mockState.deleteMediaBufferCalls = [];
  });

  it("registers tool-event recipients for clients advertising tool-events capability", async () => {
@@ -1965,10 +2022,8 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
          }
        | undefined;
      expect(mockState.lastDispatchImages).toBeUndefined();
-      expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
-      expect(mockState.lastDispatchCtx?.Body).toMatch(
-        /^summarize this\n\[media attached: media:\/\/inbound\//,
-      );
+      expect(mockState.lastDispatchImageOrder).toBeUndefined();
+      expect(mockState.lastDispatchCtx?.Body).toBe("summarize this");
      expect(mockState.savedMediaCalls).toEqual([
        expect.objectContaining({
          contentType: "application/pdf",
@@ -1976,7 +2031,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
          size: expect.any(Number),
        }),
      ]);
-      expect(message?.content).toMatch(/^summarize this\n\[media attached: media:\/\/inbound\//);
+      expect(message?.content).toBe("summarize this");
      expect(message?.MediaPath).toBe("/tmp/chat-send-brief.pdf");
      expect(message?.MediaPaths).toEqual(["/tmp/chat-send-brief.pdf"]);
      expect(message?.MediaType).toBe("application/pdf");
@@ -2335,6 +2390,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
        ],
      },
      expectBroadcast: false,
+      waitFor: "none",
    });

    expect(mockState.lastDispatchImages).toBeUndefined();
@@ -2449,6 +2505,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
        ],
      },
      expectBroadcast: false,
+      waitFor: "none",
    });

    expect(mockState.lastDispatchImages).toBeUndefined();
@@ -2461,6 +2518,291 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
    ]);
  });

+  it("routes non-image offloaded refs into ctx.MediaPaths + MediaTypes for chat.send", async () => {
+    createTranscriptFixture("openclaw-chat-send-non-image-ctx-media-paths-");
+    mockState.finalText = "ok";
+    mockState.sessionEntry = {
+      modelProvider: "test-provider",
+      model: "vision-model",
+    };
+    mockState.modelCatalog = [
+      {
+        provider: "test-provider",
+        id: "vision-model",
+        name: "Vision model",
+        input: ["text", "image"],
+      },
+    ];
+    mockState.savedMediaResults = [
+      { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
+    ];
+    const respond = vi.fn();
+    const context = createChatContext();
+    const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
+
+    await runNonStreamingChatSend({
+      context,
+      respond,
+      idempotencyKey: "idem-non-image-ctx-media",
+      message: "read this",
+      requestParams: {
+        attachments: [
+          {
+            type: "file",
+            mimeType: "application/pdf",
+            fileName: "report.pdf",
+            content: pdf,
+          },
+        ],
+      },
+      expectBroadcast: false,
+    });
+
+    expect(mockState.lastDispatchCtx?.MediaPaths).toEqual([
+      "/home/user/.openclaw/media/inbound/report.pdf",
+    ]);
+    expect(mockState.lastDispatchCtx?.MediaPath).toBe(
+      "/home/user/.openclaw/media/inbound/report.pdf",
+    );
+    expect(mockState.lastDispatchCtx?.MediaTypes).toEqual(["application/pdf"]);
+    expect(mockState.lastDispatchCtx?.MediaType).toBe("application/pdf");
+    // Non-image offloads MUST NOT inject a media://URI into the prompt body —
+    // they ride through ctx.MediaPaths so buildInboundMediaNote prepends the
+    // real path, avoiding duplicate media markers.
+    expect(mockState.lastDispatchCtx?.Body).not.toContain("media://");
+    expect(mockState.lastDispatchCtx?.BodyForAgent).not.toContain("media://");
+    expect(mockState.lastDispatchImages).toBeUndefined();
+    // Marker replaces the implicit "relative-path no-op" coupling in
+    // get-reply.ts with an explicit skip contract.
+    expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
+  });
+
+  it("preserves sandbox-relative MediaPaths and stores workspace context for media-understanding", async () => {
+    createTranscriptFixture("openclaw-chat-send-non-image-absolutize-");
+    mockState.finalText = "ok";
+    mockState.sessionEntry = {
+      modelProvider: "test-provider",
+      model: "vision-model",
+    };
+    mockState.modelCatalog = [
+      {
+        provider: "test-provider",
+        id: "vision-model",
+        name: "Vision model",
+        input: ["text", "image"],
+      },
+    ];
+    mockState.savedMediaResults = [
+      { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
+    ];
+    mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
+    mockState.stagedRelativePaths = ["media/inbound/report.pdf"];
+    const respond = vi.fn();
+    const context = createChatContext();
+    const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
+
+    await runNonStreamingChatSend({
+      context,
+      respond,
+      idempotencyKey: "idem-non-image-absolutize",
+      message: "read this",
+      requestParams: {
+        attachments: [
+          {
+            type: "file",
+            mimeType: "application/pdf",
+            fileName: "report.pdf",
+            content: pdf,
+          },
+        ],
+      },
+      expectBroadcast: false,
+    });
+
+    expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["media/inbound/report.pdf"]);
+    expect(mockState.lastDispatchCtx?.MediaPath).toBe("media/inbound/report.pdf");
+    expect(mockState.lastDispatchCtx?.MediaWorkspaceDir).toBe("/sandbox/workspace");
+    expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
+  });
+
+  it("wraps stageSandboxMedia infrastructure errors as 5xx UNAVAILABLE and cleans up media-store files", async () => {
+    createTranscriptFixture("openclaw-chat-send-stage-unavailable-");
+    mockState.finalText = "ok";
+    mockState.sessionEntry = {
+      modelProvider: "test-provider",
+      model: "vision-model",
+    };
+    mockState.modelCatalog = [
+      {
+        provider: "test-provider",
+        id: "vision-model",
+        name: "Vision model",
+        input: ["text", "image"],
+      },
+    ];
+    mockState.savedMediaResults = [
+      { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
+    ];
+    mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
+    mockState.stageSandboxMediaError = Object.assign(new Error("ENOSPC: no space left on device"), {
+      code: "ENOSPC",
+    });
+    const respond = vi.fn();
+    const context = createChatContext();
+    const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
+
+    await runNonStreamingChatSend({
+      context,
+      respond,
+      idempotencyKey: "idem-stage-unavailable",
+      message: "read this",
+      requestParams: {
+        attachments: [
+          {
+            type: "file",
+            mimeType: "application/pdf",
+            fileName: "report.pdf",
+            content: pdf,
+          },
+        ],
+      },
+      expectBroadcast: false,
+      waitFor: "none",
+    });
+
+    // Plain Error from stageSandboxMedia would be misclassified as INVALID_REQUEST
+    // by the outer catch. Wrapping it in MediaOffloadError routes it to UNAVAILABLE
+    // so the client retries instead of treating it as a bad request.
+    expect(mockState.lastDispatchCtx).toBeUndefined();
+    expect(respond).toHaveBeenCalledTimes(1);
+    const [ok, payload, error] = respond.mock.calls[0] ?? [];
+    expect(ok).toBe(false);
+    expect(payload).toBeUndefined();
+    expect(error?.code).toBe(ErrorCodes.UNAVAILABLE);
+    expect(error?.message ?? String(error)).toMatch(/ENOSPC|non-image attachments/i);
+    // Orphaned media-store files are cleaned up before the 5xx surfaces.
+    expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]);
+  });
+
+  it("surfaces partial non-image staging failures as 5xx UNAVAILABLE", async () => {
+    // Regression: stageSandboxMedia keeps unstaged entries as their original
+    // absolute path, so a simple `stagedPaths.length === nonImage.length`
+    // check could not detect when one of the files silently fell out (e.g. a
+    // file between the RPC cap and the staging cap). Prestage must compare
+    // the returned `staged` map against the input refs.
+    createTranscriptFixture("openclaw-chat-send-partial-stage-");
+    mockState.finalText = "ok";
+    mockState.sessionEntry = {
+      modelProvider: "test-provider",
+      model: "vision-model",
+    };
+    mockState.modelCatalog = [
+      {
+        provider: "test-provider",
+        id: "vision-model",
+        name: "Vision model",
+        input: ["text", "image"],
+      },
+    ];
+    mockState.savedMediaResults = [
+      { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
+      { path: "/home/user/.openclaw/media/inbound/oversize.pdf", contentType: "application/pdf" },
+    ];
+    mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
+    mockState.stagedRelativePaths = ["media/inbound/report.pdf", "media/inbound/oversize.pdf"];
+    mockState.unstagedSources = ["/home/user/.openclaw/media/inbound/oversize.pdf"];
+    const respond = vi.fn();
+    const context = createChatContext();
+    const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
+
+    await runNonStreamingChatSend({
+      context,
+      respond,
+      idempotencyKey: "idem-partial-stage",
+      message: "read these",
+      requestParams: {
+        attachments: [
+          { type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf },
+          { type: "file", mimeType: "application/pdf", fileName: "oversize.pdf", content: pdf },
+        ],
+      },
+      expectBroadcast: false,
+      waitFor: "none",
+    });
+
+    expect(mockState.lastDispatchCtx).toBeUndefined();
+    expect(respond).toHaveBeenCalledTimes(1);
+    const [ok, payload, error] = respond.mock.calls[0] ?? [];
+    expect(ok).toBe(false);
+    expect(payload).toBeUndefined();
+    expect(error?.code).toBe(ErrorCodes.UNAVAILABLE);
+    expect(error?.message ?? String(error)).toMatch(/staging incomplete/i);
+    // Both media-store entries are cleaned up before the 5xx surfaces.
+    expect(mockState.deleteMediaBufferCalls.map((c) => c.id).toSorted()).toEqual([
+      "saved-media",
+      "saved-media",
+    ]);
+  });
+
+  it("rejects sandbox-oversized non-image attachments as 4xx before staging", async () => {
+    // Regression: resolveChatAttachmentMaxBytes defaults to 20MB, but
+    // stageSandboxMedia caps each file at STAGED_MEDIA_MAX_BYTES (5MB) and
+    // silently drops oversize files. Without a pre-check, a sandbox session
+    // accepting a 5-20MB non-image would fail staging and surface as a
+    // retryable 5xx UNAVAILABLE, misleading clients into retrying a
+    // deterministically broken request.
+    createTranscriptFixture("openclaw-chat-send-sandbox-oversize-");
+    mockState.finalText = "ok";
+    mockState.sessionEntry = {
+      modelProvider: "test-provider",
+      model: "vision-model",
+    };
+    mockState.modelCatalog = [
+      {
+        provider: "test-provider",
+        id: "vision-model",
+        name: "Vision model",
+        input: ["text", "image"],
+      },
+    ];
+    mockState.savedMediaResults = [
+      { path: "/home/user/.openclaw/media/inbound/huge.pdf", contentType: "application/pdf" },
+    ];
+    mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
+    const respond = vi.fn();
+    const context = createChatContext();
+    // 6MB buffer — above STAGED_MEDIA_MAX_BYTES (5MB) but below the 20MB parse cap.
+    const oversized = Buffer.alloc(6 * 1024 * 1024);
+    oversized.set(Buffer.from("%PDF-1.4\n"), 0);
+    const pdf = oversized.toString("base64");
+
+    await runNonStreamingChatSend({
+      context,
+      respond,
+      idempotencyKey: "idem-sandbox-oversize",
+      message: "read this",
+      requestParams: {
+        attachments: [
+          { type: "file", mimeType: "application/pdf", fileName: "huge.pdf", content: pdf },
+        ],
+      },
+      expectBroadcast: false,
+      waitFor: "none",
+    });
+
+    expect(mockState.lastDispatchCtx).toBeUndefined();
+    expect(respond).toHaveBeenCalledTimes(1);
+    const [ok, payload, error] = respond.mock.calls[0] ?? [];
+    expect(ok).toBe(false);
+    expect(payload).toBeUndefined();
+    // 4xx, not 5xx — retrying a file that exceeds the staging cap cannot
+    // succeed, so the failure must be surfaced as a client-side rejection.
+    expect(error?.code).toBe(ErrorCodes.INVALID_REQUEST);
+    expect(error?.message ?? String(error)).toMatch(/sandbox staging limit/i);
+    // Orphaned media-store entries are cleaned up before the 4xx surfaces.
+    expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]);
+  });
+
  it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {
    createTranscriptFixture("openclaw-chat-send-image-order-");
    mockState.finalText = "ok";
--- a/src/gateway/server-methods/chat.ts
+++ b/src/gateway/server-methods/chat.ts
@@ -2,16 +2,20 @@ import fs from "node:fs";
 import path from "node:path";
 import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent";
 import { resolveSendableOutboundReplyParts } from "openclaw/plugin-sdk/reply-payload";
-import { resolveSessionAgentId } from "../../agents/agent-scope.js";
+import { resolveAgentWorkspaceDir, resolveSessionAgentId } from "../../agents/agent-scope.js";
 import { resolveThinkingDefault } from "../../agents/model-selection.js";
 import { rewriteTranscriptEntriesInSessionFile } from "../../agents/pi-embedded-runner/transcript-rewrite.js";
+import { ensureSandboxWorkspaceForSession } from "../../agents/sandbox/context.js";
 import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
 import { dispatchInboundMessage } from "../../auto-reply/dispatch.js";
 import type { ReplyPayload } from "../../auto-reply/reply-payload.js";
 import { createReplyDispatcher } from "../../auto-reply/reply/reply-dispatcher.js";
-import type { MsgContext } from "../../auto-reply/templating.js";
+import { stageSandboxMedia } from "../../auto-reply/reply/stage-sandbox-media.js";
+import type { MsgContext, TemplateContext } from "../../auto-reply/templating.js";
 import { extractCanvasFromText } from "../../chat/canvas-render.js";
 import { resolveSessionFilePath } from "../../config/sessions.js";
+import type { OpenClawConfig } from "../../config/types.openclaw.js";
+import { formatErrorMessage } from "../../infra/errors.js";
 import { jsonUtf8Bytes } from "../../infra/json-utf8-bytes.js";
 import { normalizeReplyPayloadsForDelivery } from "../../infra/outbound/payloads.js";
 import { getSessionBindingService } from "../../infra/outbound/session-binding-service.js";
@@ -22,7 +26,12 @@ import {
 } from "../../media/local-roots.js";
 import { isAudioFileName } from "../../media/mime.js";
 import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js";
-import { type SavedMedia, saveMediaBuffer } from "../../media/store.js";
+import {
+  deleteMediaBuffer,
+  MEDIA_MAX_BYTES,
+  type SavedMedia,
+  saveMediaBuffer,
+} from "../../media/store.js";
 import { createChannelReplyPipeline } from "../../plugin-sdk/channel-reply-pipeline.js";
 import { isPluginOwnedSessionBindingRecord } from "../../plugins/conversation-binding.js";
 import { normalizeInputProvenance, type InputProvenance } from "../../sessions/input-provenance.js";
@@ -48,10 +57,12 @@ import {
 } from "../chat-abort.js";
 import {
  type ChatImageContent,
+  MediaOffloadError,
  type OffloadedRef,
  parseMessageWithAttachments,
+  resolveChatAttachmentMaxBytes,
+  UnsupportedAttachmentError,
 } from "../chat-attachments.js";
-import { MediaOffloadError } from "../chat-attachments.js";
 import {
  isToolHistoryBlockType,
  projectChatDisplayMessage,
@@ -712,14 +723,28 @@ async function persistChatSendImages(params: {
      );
    }
  }
-  const offloadedSaved = params.offloadedRefs.map((ref) => ({
-    id: ref.id,
-    path: ref.path,
-    size: 0,
-    contentType: ref.mimeType,
-  }));
+  // imageOrder now only tracks image slots (see chat-attachments.ts), so split
+  // offloaded refs by mime: image offloads interleave with inline images via
+  // imageOrder, and non-image offloads append to the transcript tail. Without
+  // this split a non-image file would consume the next image slot whenever
+  // both kinds appear in the same request.
+  const imageOffloadedSaved: SavedMedia[] = [];
+  const nonImageOffloadedSaved: SavedMedia[] = [];
+  for (const ref of params.offloadedRefs) {
+    const entry: SavedMedia = {
+      id: ref.id,
+      path: ref.path,
+      size: 0,
+      contentType: ref.mimeType,
+    };
+    if (ref.mimeType.startsWith("image/")) {
+      imageOffloadedSaved.push(entry);
+    } else {
+      nonImageOffloadedSaved.push(entry);
+    }
+  }
  if (params.imageOrder.length === 0) {
-    return [...inlineSaved, ...offloadedSaved];
+    return [...inlineSaved, ...imageOffloadedSaved, ...nonImageOffloadedSaved];
  }
  const saved: SavedMedia[] = [];
  let inlineIndex = 0;
@@ -732,7 +757,7 @@ async function persistChatSendImages(params: {
      }
      continue;
    }
-    const offloaded = offloadedSaved[offloadedIndex++];
+    const offloaded = imageOffloadedSaved[offloadedIndex++];
    if (offloaded) {
      saved.push(offloaded);
    }
@@ -743,12 +768,15 @@ async function persistChatSendImages(params: {
      saved.push(inline);
    }
  }
-  for (; offloadedIndex < offloadedSaved.length; offloadedIndex++) {
-    const offloaded = offloadedSaved[offloadedIndex];
+  for (; offloadedIndex < imageOffloadedSaved.length; offloadedIndex++) {
+    const offloaded = imageOffloadedSaved[offloadedIndex];
    if (offloaded) {
      saved.push(offloaded);
    }
  }
+  for (const offloaded of nonImageOffloadedSaved) {
+    saved.push(offloaded);
+  }
  return saved;
 }

@@ -766,6 +794,115 @@ function buildChatSendTranscriptMessage(params: {
  };
 }

+// Stages non-image offloads into the agent sandbox synchronously so chat.send
+// can surface 5xx before respond(). Throws MediaOffloadError on any staging
+// failure (ENOSPC / EPERM / partial-stage) so the outer chat.send handler can
+// map it to UNAVAILABLE (5xx); plain Error would be misclassified as 4xx. All
+// offloaded refs are cleaned up from the media store before rethrow.
+// Callers MUST set ctx.MediaStaged=true when this runs so the dispatch
+// pipeline skips its own stageSandboxMedia pass.
+//
+// Returned paths are ABSOLUTE (pointing into the sandbox workspace when sandbox
+// is enabled, or the media-store origin when it is not). applyMediaUnderstanding
+// runs before any further staging in get-reply.ts and uses
+// `path.isAbsolute(raw) ? raw : path.resolve(raw)` against the gateway CWD, so
+// any relative path here would make media-understanding target the wrong host
+// path and silently skip file analysis.
+async function prestageNonImageOffloads(params: {
+  offloadedRefs: OffloadedRef[];
+  cfg: OpenClawConfig;
+  sessionKey: string;
+  agentId: string;
+}): Promise<{ paths: string[]; types: string[]; workspaceDir?: string }> {
+  const nonImage = params.offloadedRefs.filter((ref) => !ref.mimeType.startsWith("image/"));
+  if (nonImage.length === 0) {
+    return { paths: [], types: [] };
+  }
+
+  try {
+    const workspaceDir = resolveAgentWorkspaceDir(params.cfg, params.agentId);
+    const sandbox = await ensureSandboxWorkspaceForSession({
+      config: params.cfg,
+      sessionKey: params.sessionKey,
+      workspaceDir,
+    });
+    if (!sandbox) {
+      return {
+        paths: nonImage.map((ref) => ref.path),
+        types: nonImage.map((ref) => ref.mimeType),
+      };
+    }
+
+    // stageSandboxMedia caps each file at STAGED_MEDIA_MAX_BYTES (=
+    // MEDIA_MAX_BYTES, 5MB) and silently skips oversized files. The parse cap
+    // (resolveChatAttachmentMaxBytes, default 20MB) is higher, so a sandboxed
+    // session receiving a non-image file between the two caps would otherwise
+    // pass parse, fail staging, and surface as a retryable 5xx even though
+    // retry cannot succeed. Reject here as a client-side 4xx instead.
+    const oversizedForSandbox = nonImage.filter((ref) => ref.sizeBytes > MEDIA_MAX_BYTES);
+    if (oversizedForSandbox.length > 0) {
+      const details = oversizedForSandbox
+        .map((ref) => `${ref.label} (${ref.sizeBytes} bytes)`)
+        .join(", ");
+      throw new UnsupportedAttachmentError(
+        "non-image-too-large-for-sandbox",
+        `non-image attachments exceed sandbox staging limit (${MEDIA_MAX_BYTES} bytes): ${details}`,
+      );
+    }
+
+    const stagingCtx: MsgContext = {
+      MediaPath: nonImage[0].path,
+      MediaPaths: nonImage.map((ref) => ref.path),
+      MediaType: nonImage[0].mimeType,
+      MediaTypes: nonImage.map((ref) => ref.mimeType),
+    };
+    const stageResult = await stageSandboxMedia({
+      ctx: stagingCtx,
+      sessionCtx: stagingCtx as TemplateContext,
+      cfg: params.cfg,
+      sessionKey: params.sessionKey,
+      workspaceDir,
+    });
+
+    // stageSandboxMedia silently keeps unstaged entries as their original
+    // absolute path, so length parity with `nonImage` does not prove every
+    // file landed in the sandbox. The RPC max (20MB via
+    // resolveChatAttachmentMaxBytes) admits files above the staging cap
+    // (STAGED_MEDIA_MAX_BYTES = 5MB); check the returned `staged` map so any
+    // missing source becomes a 5xx MediaOffloadError the client can retry.
+    const stagedSources = stageResult.staged;
+    const missing = nonImage.filter((ref) => !stagedSources.has(ref.path));
+    if (missing.length > 0) {
+      throw new Error(
+        `non-image attachment staging incomplete: ${stagedSources.size}/${nonImage.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`,
+      );
+    }
+    const stagedPaths = stagingCtx.MediaPaths ?? [];
+    const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);
+
+    // Keep stagedPaths sandbox-relative (e.g. `media/inbound/foo.pdf`) so the
+    // agent inside the container can read them. Host-side media-understanding
+    // resolves them via ctx.MediaWorkspaceDir, which we carry separately.
+    return { paths: stagedPaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir };
+  } catch (err) {
+    await Promise.allSettled(
+      params.offloadedRefs.map((ref) => deleteMediaBuffer(ref.id, "inbound")),
+    );
+    if (err instanceof MediaOffloadError) {
+      throw err;
+    }
+    // Sandbox-oversize rejections are client-side 4xx (see check above). Wrapping
+    // them as MediaOffloadError would misclassify them as retryable 5xx.
+    if (err instanceof UnsupportedAttachmentError) {
+      throw err;
+    }
+    throw new MediaOffloadError(
+      `[Gateway Error] Failed to stage non-image attachments into agent workspace: ${formatErrorMessage(err)}`,
+      { cause: err },
+    );
+  }
+}
+
 function resolveChatSendTranscriptMediaFields(savedImages: SavedMedia[]) {
  const mediaPaths = savedImages.map((entry) => entry.path);
  if (mediaPaths.length === 0) {
@@ -1759,6 +1896,9 @@ export const chatHandlers: GatewayRequestHandlers = {
    let parsedImages: ChatImageContent[] = [];
    let imageOrder: PromptImageOrderEntry[] = [];
    let offloadedRefs: OffloadedRef[] = [];
+    let nonImageMediaPaths: string[] = [];
+    let nonImageMediaTypes: string[] = [];
+    let nonImageMediaWorkspaceDir: string | undefined;
    const timeoutMs = resolveAgentTimeoutMs({
      cfg,
      overrideMs: p.timeoutMs,
@@ -1833,14 +1973,27 @@ export const chatHandlers: GatewayRequestHandlers = {
        explicitOriginTargetsPlugin;
      try {
        const parsed = await parseMessageWithAttachments(inboundMessage, normalizedAttachments, {
-          maxBytes: 5_000_000,
+          maxBytes: resolveChatAttachmentMaxBytes(cfg),
          log: context.logGateway,
          supportsImages,
+          // chat.send routes non-image offloadedRefs into ctx.MediaPaths below
+          // so the auto-reply stage pipeline can surface them to the agent.
+          acceptNonImage: true,
        });
        parsedMessage = parsed.message;
        parsedImages = parsed.images;
        imageOrder = parsed.imageOrder;
        offloadedRefs = parsed.offloadedRefs;
+        ({
+          paths: nonImageMediaPaths,
+          types: nonImageMediaTypes,
+          workspaceDir: nonImageMediaWorkspaceDir,
+        } = await prestageNonImageOffloads({
+          offloadedRefs,
+          cfg,
+          sessionKey,
+          agentId,
+        }));
      } catch (err) {
        respond(
          false,
@@ -1947,6 +2100,19 @@ export const chatHandlers: GatewayRequestHandlers = {
        GatewayClientScopes: client?.connect?.scopes ?? [],
        ...pluginBoundMediaFields,
      };
+      if (nonImageMediaPaths.length > 0) {
+        // Inject non-image offloads via the same MsgContext fields the channel
+        // path uses so buildInboundMediaNote renders a real `[media attached:
+        // <workspace-relative-path>]` line into the agent prompt. Marker
+        // blocks the dispatch pipeline from re-running stageSandboxMedia; see
+        // prestageNonImageOffloads.
+        ctx.MediaPath = nonImageMediaPaths[0];
+        ctx.MediaPaths = nonImageMediaPaths;
+        ctx.MediaType = nonImageMediaTypes[0];
+        ctx.MediaTypes = nonImageMediaTypes;
+        ctx.MediaWorkspaceDir = nonImageMediaWorkspaceDir;
+        ctx.MediaStaged = true;
+      }

      const { onModelSelected, ...replyPipeline } = createChannelReplyPipeline({
        cfg,
--- a/src/gateway/server-node-events.runtime.ts
+++ b/src/gateway/server-node-events.runtime.ts
@@ -15,7 +15,7 @@ export { enqueueSystemEvent } from "../infra/system-events.js";
 export { deleteMediaBuffer } from "../media/store.js";
 export { normalizeMainKey, scopedHeartbeatWakeOptions } from "../routing/session-key.js";
 export { defaultRuntime } from "../runtime.js";
-export { parseMessageWithAttachments } from "./chat-attachments.js";
+export { parseMessageWithAttachments, resolveChatAttachmentMaxBytes } from "./chat-attachments.js";
 export { normalizeRpcAttachmentsToChatAttachments } from "./server-methods/attachment-normalize.js";
 export {
  loadSessionEntry,
--- a/src/gateway/server-node-events.test.ts
+++ b/src/gateway/server-node-events.test.ts
@@ -90,6 +90,7 @@ const runtimeMocks = vi.hoisted(() => ({
  parseMessageWithAttachments: parseMessageWithAttachmentsMock,
  registerApnsRegistration: registerApnsRegistrationMock,
  requestHeartbeatNow: vi.fn(),
+  resolveChatAttachmentMaxBytes: vi.fn(() => 20 * 1024 * 1024),
  resolveGatewayModelSupportsImages: vi.fn(
    async ({
      loadGatewayModelCatalog,
@@ -923,7 +924,7 @@ describe("agent request events", () => {
    expect(opts.runId).toBe(opts.sessionId);
  });

-  it("passes supportsImages false for text-only node-session models", async () => {
+  it("passes supportsInlineImages false for text-only node-session models", async () => {
    const ctx = buildCtx();
    ctx.loadGatewayModelCatalog = async () => [
      {
@@ -960,7 +961,41 @@ describe("agent request events", () => {
    expect(parseMessageWithAttachmentsMock).toHaveBeenCalledWith(
      "describe",
      expect.any(Array),
-      expect.objectContaining({ supportsImages: false }),
+      expect.objectContaining({ supportsInlineImages: false }),
    );
  });
+
+  it("declines non-image attachments cleanly when parse throws UnsupportedAttachmentError", async () => {
+    const warn = vi.fn();
+    const ctx = buildCtx();
+    ctx.logGateway = { warn };
+
+    parseMessageWithAttachmentsMock.mockRejectedValueOnce(
+      Object.assign(new Error("attachment a.pdf: non-image attachments not supported"), {
+        name: "UnsupportedAttachmentError",
+        reason: "unsupported-non-image",
+      }),
+    );
+
+    await handleNodeEvent(ctx, "node-non-image-refusal", {
+      event: "agent.request",
+      payloadJSON: JSON.stringify({
+        message: "read this",
+        sessionKey: "agent:main:main",
+        attachments: [
+          {
+            type: "file",
+            mimeType: "application/pdf",
+            fileName: "a.pdf",
+            content: "JVBERi0=",
+          },
+        ],
+      }),
+    });
+
+    // server-node-events must log-and-return on parse failure — no agent
+    // dispatch, no crash, and the refusal reason bubbles up via logGateway.
+    expect(agentCommandMock).not.toHaveBeenCalled();
+    expect(warn).toHaveBeenCalledWith(expect.stringMatching(/attachment parse failed.*non-image/i));
+  });
 });
--- a/src/gateway/server-node-events.ts
+++ b/src/gateway/server-node-events.ts
@@ -26,6 +26,7 @@ import {
  parseMessageWithAttachments,
  registerApnsRegistration,
  requestHeartbeatNow,
+  resolveChatAttachmentMaxBytes,
  resolveGatewayModelSupportsImages,
  resolveOutboundTarget,
  resolveSessionAgentId,
@@ -427,16 +428,20 @@ export const handleNodeEvent = async (ctx: NodeEventContext, nodeId: string, evt
      if (normalizedAttachments.length > 0) {
        const sessionAgentId = resolveSessionAgentId({ sessionKey, config: cfg });
        const modelRef = resolveSessionModelRef(cfg, entry, sessionAgentId);
-        const supportsImages = await resolveGatewayModelSupportsImages({
+        const supportsInlineImages = await resolveGatewayModelSupportsImages({
          loadGatewayModelCatalog: ctx.loadGatewayModelCatalog,
          provider: modelRef.provider,
          model: modelRef.model,
        });
        try {
          const parsed = await parseMessageWithAttachments(message, normalizedAttachments, {
-            maxBytes: 5_000_000,
+            maxBytes: resolveChatAttachmentMaxBytes(cfg),
            log: ctx.logGateway,
-            supportsImages,
+            supportsInlineImages,
+            // server-node-events dispatches via agentCommandFromIngress which
+            // has no ctx.MediaPaths wiring; reject non-image attachments
+            // explicitly rather than saving them where the agent cannot reach them.
+            acceptNonImage: false,
          });
          message = parsed.message.trim();
          images = parsed.images;
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -536,6 +536,7 @@ export async function applyMediaUnderstanding(params: {
  const cache = createMediaAttachmentCache(attachments, {
    localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
    ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
+    workspaceDir: ctx.MediaWorkspaceDir,
  });

  try {
--- a/src/media-understanding/attachments.cache.ts
+++ b/src/media-understanding/attachments.cache.ts
@@ -53,6 +53,7 @@ export type MediaAttachmentCacheOptions = {
  localPathRoots?: readonly string[];
  includeDefaultLocalPathRoots?: boolean;
  ssrfPolicy?: SsrFPolicy;
+  workspaceDir?: string;
 };

 function resolveRequestUrl(input: RequestInfo | URL): string {
@@ -70,6 +71,7 @@ export class MediaAttachmentCache {
  private readonly attachments: MediaAttachment[];
  private readonly localPathRoots: readonly string[];
  private readonly ssrfPolicy: SsrFPolicy | undefined;
+  private readonly workspaceDir?: string;
  private canonicalLocalPathRoots?: Promise<readonly string[]>;

  constructor(attachments: MediaAttachment[], options?: MediaAttachmentCacheOptions) {
@@ -79,6 +81,7 @@ export class MediaAttachmentCache {
      options?.includeDefaultLocalPathRoots === false
        ? mergeInboundPathRoots(options.localPathRoots)
        : mergeInboundPathRoots(options?.localPathRoots, getDefaultLocalPathRoots());
+    this.workspaceDir = options?.workspaceDir ? path.resolve(options.workspaceDir) : undefined;
    for (const attachment of attachments) {
      this.entries.set(attachment.index, { attachment });
    }
@@ -293,7 +296,7 @@ export class MediaAttachmentCache {
    if (!rawPath) {
      return undefined;
    }
-    return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
+    return this.workspaceDir ? path.resolve(this.workspaceDir, rawPath) : path.resolve(rawPath);
  }

  private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
--- a/src/media-understanding/media-understanding-misc.test.ts
+++ b/src/media-understanding/media-understanding-misc.test.ts
@@ -97,6 +97,23 @@ describe("media understanding attachments SSRF", () => {
    });
  });

+  it("resolves relative attachment paths against the provided workspaceDir", async () => {
+    await withTempDir({ prefix: "openclaw-media-cache-workspace-" }, async (base) => {
+      const workspaceDir = path.join(base, "workspace");
+      const attachmentPath = path.join(workspaceDir, "media", "inbound", "report.pdf");
+      await fs.mkdir(path.dirname(attachmentPath), { recursive: true });
+      await fs.writeFile(attachmentPath, "ok");
+
+      const cache = new MediaAttachmentCache([{ index: 0, path: "media/inbound/report.pdf" }], {
+        localPathRoots: [workspaceDir],
+        workspaceDir,
+      });
+
+      const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 });
+      expect(result.buffer.toString()).toBe("ok");
+    });
+  });
+
  it("blocks local attachments outside configured roots", async () => {
    if (process.platform === "win32") {
      return;
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -270,8 +270,15 @@ export function resolveMediaAttachmentLocalRoots(params: {
  cfg: OpenClawConfig;
  ctx: MsgContext;
 }): readonly string[] {
+  // ctx.MediaWorkspaceDir is set by chat.send's prestageNonImageOffloads when
+  // inbound attachments were staged into a sandbox workspace. The paths in
+  // ctx.MediaPaths are kept sandbox-relative (so the agent inside the
+  // container can read them), and the workspace dir is carried separately so
+  // host-side media-understanding can still resolve them via this root list.
+  const workspaceDir = params.ctx.MediaWorkspaceDir;
  return mergeInboundPathRoots(
    getDefaultMediaLocalRoots(),
+    workspaceDir ? [path.resolve(workspaceDir)] : undefined,
    resolveChannelInboundAttachmentRoots(params),
  );
 }