fix(gateway): keep sandbox-relative MediaPaths and pass workspace context

Root cause: ctx.MediaPaths was overloaded with two incompatible meanings — sandbox-relative for the agent runtime, host-absolute for host-side media-understanding. The previous "absolutize in chat.send + set MediaStaged=true" path made media-understanding work but shipped an unreadable host path to the agent inside the sandbox. - Keep ctx.MediaPaths sandbox-relative after prestage; carry a separate ctx.MediaWorkspaceDir so host-side media-understanding can still resolve the staged files via localPathRoots / attachment cache. - stageSandboxMedia returns an authoritative {source -> relpath} map so prestageNonImageOffloads detects partial staging failures (files admitted by the 20MB RPC cap but rejected by the 5MB staging cap) and surfaces them as 5xx MediaOffloadError UNAVAILABLE. - Reject images above MAX_IMAGE_BYTES at parse time: the agent-side hydration path drops them silently otherwise, producing a successful response with a missing image. - Scope imageOrder to image offloads only and split persistChatSendImages offloaded refs by mime so non-image files append to the transcript tail instead of consuming image slots in mixed batches. Signed-off-by: samzong <samzong.lu@gmail.com>
2026-05-06 08:50:43 +00:00 · 2026-04-16 22:53:32 +08:00
parent bb259ac039
commit 72d0625251
9 changed files with 236 additions and 35 deletions
--- a/src/auto-reply/reply/stage-sandbox-media.ts
+++ b/src/auto-reply/reply/stage-sandbox-media.ts
@@ -19,18 +19,31 @@ import type { MsgContext, TemplateContext } from "../templating.js";

 const STAGED_MEDIA_MAX_BYTES = MEDIA_MAX_BYTES;

+// `staged` maps every absolute source path that was copied into the sandbox
+// (or remote cache) to its rewritten ctx path. Callers like chat.send's
+// prestage use this to detect partial failures: unstaged sources keep their
+// original absolute path in ctx.MediaPaths, so a length check against the
+// input cannot distinguish "everything staged" from "silently skipped some"
+// (e.g. the 5MB cap in STAGED_MEDIA_MAX_BYTES rejecting files that the
+// chat.send RPC already admitted under its 20MB cap).
+export type StageSandboxMediaResult = {
+  staged: ReadonlyMap<string, string>;
+};
+
+const EMPTY_STAGE_RESULT: StageSandboxMediaResult = { staged: new Map() };
+
 export async function stageSandboxMedia(params: {
  ctx: MsgContext;
  sessionCtx: TemplateContext;
  cfg: OpenClawConfig;
  sessionKey?: string;
  workspaceDir: string;
-}) {
+}): Promise<StageSandboxMediaResult> {
  const { ctx, sessionCtx, cfg, sessionKey, workspaceDir } = params;
  const hasPathsArray = Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0;
  const rawPaths = resolveRawPaths(ctx);
  if (rawPaths.length === 0 || !sessionKey) {
-    return;
+    return EMPTY_STAGE_RESULT;
  }

  const sandbox = await ensureSandboxWorkspaceForSession({
@@ -45,7 +58,7 @@ export async function stageSandboxMedia(params: {
    : null;
  const effectiveWorkspaceDir = sandbox?.workspaceDir ?? remoteMediaCacheDir;
  if (!effectiveWorkspaceDir) {
-    return;
+    return EMPTY_STAGE_RESULT;
  }

  await fs.mkdir(effectiveWorkspaceDir, { recursive: true });
@@ -116,6 +129,8 @@ export async function stageSandboxMedia(params: {
    staged,
    hasPathsArray,
  });
+
+  return { staged };
 }

 async function stageLocalFileIntoRoot(params: {
--- a/src/gateway/chat-attachments.test.ts
+++ b/src/gateway/chat-attachments.test.ts
@@ -20,6 +20,7 @@ vi.mock("../media/store.js", async (importOriginal) => {
 });

 import type { OpenClawConfig } from "../config/types.openclaw.js";
+import { MAX_IMAGE_BYTES } from "../media/constants.js";
 import {
  buildMessageWithAttachments,
  type ChatAttachment,
@@ -192,7 +193,56 @@ describe("parseMessageWithAttachments", () => {
    expect(parsed.images[0]?.mimeType).toBe("image/png");
    expect(parsed.offloadedRefs).toHaveLength(1);
    expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf");
+    expect(parsed.imageOrder).toEqual(["inline"]);
+  });
+
+  it("excludes non-image offloads from imageOrder in mixed batches", async () => {
+    // Regression: a prior revision pushed "offloaded" for every offload,
+    // including non-image files. In a [non-image, inline, offloaded-image]
+    // batch that produced imageOrder=["offloaded","inline","offloaded"] even
+    // though only one `[media attached: media://...]` line is ever appended
+    // to the prompt (for the image offload). extractTrailingAttachmentMediaUris
+    // then read count=2 against one trailing URI, and
+    // mergePromptAttachmentImages placed the single offloaded image into the
+    // first "offloaded" slot — swapping it ahead of the inline image.
+    const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
+    const bigPng = Buffer.alloc(2_100_000);
+    bigPng.set([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], 0);
+    const { parsed } = await parseWithWarnings("x", [
+      { type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf },
+      { type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 },
+      {
+        type: "image",
+        mimeType: "image/png",
+        fileName: "big.png",
+        content: bigPng.toString("base64"),
+      },
+    ]);
+    expect(parsed.images).toHaveLength(1);
+    expect(parsed.offloadedRefs.map((ref) => ref.mimeType)).toEqual([
+      "application/pdf",
+      "image/png",
+    ]);
    expect(parsed.imageOrder).toEqual(["inline", "offloaded"]);
+    // The offloaded-image URI is the sole trailing media:// line, matching
+    // imageOrder's single "offloaded" slot.
+    const trailingMediaLines = parsed.message
+      .split("\n")
+      .filter((line) => line.trim().startsWith("[media attached: media://inbound/"));
+    expect(trailingMediaLines).toHaveLength(1);
+  });
+
+  it("rejects oversized images before offload", async () => {
+    const big = Buffer.alloc(MAX_IMAGE_BYTES + 1, 1).toString("base64");
+
+    await expect(
+      parseMessageWithAttachments(
+        "x",
+        [{ type: "image", mimeType: "image/png", fileName: "huge.png", content: big }],
+        { maxBytes: resolveChatAttachmentMaxBytes({} as OpenClawConfig), log: { warn: () => {} } },
+      ),
+    ).rejects.toThrow(/image exceeds size limit/i);
+    expect(saveMediaBufferMock).not.toHaveBeenCalled();
  });

  it("preserves specific OOXML mime when sniff returns generic zip (docx)", async () => {
--- a/src/gateway/chat-attachments.ts
+++ b/src/gateway/chat-attachments.ts
@@ -296,6 +296,13 @@ export async function parseMessageWithAttachments(
          `attachment ${label}: non-image attachments (${finalMime}) are not supported on this entrypoint`,
        );
      }
+      // Agent-side hydration (loadImageFromRef via optimizeAndClampImage / GIF
+      // direct compare) caps at MAX_IMAGE_BYTES. Accepting images above that
+      // would offload a file the runner later drops to null — a successful
+      // response with a silently missing image. Reject here so the client
+      // sees an explicit 4xx. Non-image attachments keep the full maxBytes
+      // ceiling because their host path (ctx.MediaPaths → Read/Bash) doesn't
+      // load into the model.
      if (isImage && sizeBytes > MAX_IMAGE_BYTES) {
        throw new Error(
          `attachment ${label}: image exceeds size limit (${sizeBytes} > ${MAX_IMAGE_BYTES} bytes)`,
--- a/src/gateway/server-methods/chat.directive-tags.test.ts
+++ b/src/gateway/server-methods/chat.directive-tags.test.ts
@@ -69,6 +69,11 @@ const mockState = vi.hoisted(() => ({
  sandboxWorkspace: null as { workspaceDir: string; containerWorkdir?: string } | null,
  stageSandboxMediaError: null as Error | null,
  stagedRelativePaths: null as string[] | null,
+  // `unstagedSources` lets tests simulate partial staging failure: absolute
+  // source paths listed here are excluded from the returned `staged` map even
+  // though ctx still carries their rewritten paths. This mirrors how the real
+  // stageSandboxMedia silently skips over-cap files.
+  unstagedSources: null as string[] | null,
  deleteMediaBufferCalls: [] as Array<{ id: string; subdir?: string }>,
 }));

@@ -236,10 +241,26 @@ vi.mock("../../auto-reply/reply/stage-sandbox-media.js", () => ({
      if (mockState.stageSandboxMediaError) {
        throw mockState.stageSandboxMediaError;
      }
+      const staged = new Map<string, string>();
+      const originalPaths = params.ctx.MediaPaths ?? [];
      if (mockState.stagedRelativePaths) {
-        params.ctx.MediaPaths = [...mockState.stagedRelativePaths];
-        params.ctx.MediaPath = mockState.stagedRelativePaths[0];
+        const mapping = mockState.stagedRelativePaths;
+        params.ctx.MediaPaths = [...mapping];
+        params.ctx.MediaPath = mapping[0];
+        for (let i = 0; i < mapping.length; i += 1) {
+          const source = originalPaths[i];
+          const dest = mapping[i];
+          if (source && dest) {
+            staged.set(source, dest);
+          }
+        }
      }
+      if (mockState.unstagedSources) {
+        for (const source of mockState.unstagedSources) {
+          staged.delete(source);
+        }
+      }
+      return { staged };
    },
  ),
 }));
@@ -499,6 +520,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
    mockState.sandboxWorkspace = null;
    mockState.stageSandboxMediaError = null;
    mockState.stagedRelativePaths = null;
+    mockState.unstagedSources = null;
    mockState.deleteMediaBufferCalls = [];
  });

@@ -2557,7 +2579,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
    expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
  });

-  it("absolutizes sandbox-relative MediaPaths so media-understanding can resolve them", async () => {
+  it("preserves sandbox-relative MediaPaths and stores workspace context for media-understanding", async () => {
    createTranscriptFixture("openclaw-chat-send-non-image-absolutize-");
    mockState.finalText = "ok";
    mockState.sessionEntry = {
@@ -2599,15 +2621,9 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
      expectBroadcast: false,
    });

-    // applyMediaUnderstanding resolves `path.isAbsolute(raw) ? raw : path.resolve(raw)`
-    // against the gateway CWD. Relative paths would miss the staged copy, so
-    // prestage rejoins the sandbox workspaceDir to keep the path absolute.
-    expect(mockState.lastDispatchCtx?.MediaPaths).toEqual([
-      "/sandbox/workspace/media/inbound/report.pdf",
-    ]);
-    expect(mockState.lastDispatchCtx?.MediaPath).toBe(
-      "/sandbox/workspace/media/inbound/report.pdf",
-    );
+    expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["media/inbound/report.pdf"]);
+    expect(mockState.lastDispatchCtx?.MediaPath).toBe("media/inbound/report.pdf");
+    expect(mockState.lastDispatchCtx?.MediaWorkspaceDir).toBe("/sandbox/workspace");
    expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
  });

@@ -2670,6 +2686,66 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
    expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]);
  });

+  it("surfaces partial non-image staging failures as 5xx UNAVAILABLE", async () => {
+    // Regression: stageSandboxMedia keeps unstaged entries as their original
+    // absolute path, so a simple `stagedPaths.length === nonImage.length`
+    // check could not detect when one of the files silently fell out (e.g. a
+    // file between the RPC cap and the staging cap). Prestage must compare
+    // the returned `staged` map against the input refs.
+    createTranscriptFixture("openclaw-chat-send-partial-stage-");
+    mockState.finalText = "ok";
+    mockState.sessionEntry = {
+      modelProvider: "test-provider",
+      model: "vision-model",
+    };
+    mockState.modelCatalog = [
+      {
+        provider: "test-provider",
+        id: "vision-model",
+        name: "Vision model",
+        input: ["text", "image"],
+      },
+    ];
+    mockState.savedMediaResults = [
+      { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
+      { path: "/home/user/.openclaw/media/inbound/oversize.pdf", contentType: "application/pdf" },
+    ];
+    mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
+    mockState.stagedRelativePaths = ["media/inbound/report.pdf", "media/inbound/oversize.pdf"];
+    mockState.unstagedSources = ["/home/user/.openclaw/media/inbound/oversize.pdf"];
+    const respond = vi.fn();
+    const context = createChatContext();
+    const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
+
+    await runNonStreamingChatSend({
+      context,
+      respond,
+      idempotencyKey: "idem-partial-stage",
+      message: "read these",
+      requestParams: {
+        attachments: [
+          { type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf },
+          { type: "file", mimeType: "application/pdf", fileName: "oversize.pdf", content: pdf },
+        ],
+      },
+      expectBroadcast: false,
+      waitFor: "none",
+    });
+
+    expect(mockState.lastDispatchCtx).toBeUndefined();
+    expect(respond).toHaveBeenCalledTimes(1);
+    const [ok, payload, error] = respond.mock.calls[0] ?? [];
+    expect(ok).toBe(false);
+    expect(payload).toBeUndefined();
+    expect(error?.code).toBe(ErrorCodes.UNAVAILABLE);
+    expect(error?.message ?? String(error)).toMatch(/staging incomplete/i);
+    // Both media-store entries are cleaned up before the 5xx surfaces.
+    expect(mockState.deleteMediaBufferCalls.map((c) => c.id).toSorted()).toEqual([
+      "saved-media",
+      "saved-media",
+    ]);
+  });
+
  it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {
    createTranscriptFixture("openclaw-chat-send-image-order-");
    mockState.finalText = "ok";
--- a/src/gateway/server-methods/chat.ts
+++ b/src/gateway/server-methods/chat.ts
@@ -717,14 +717,28 @@ async function persistChatSendImages(params: {
      );
    }
  }
-  const offloadedSaved = params.offloadedRefs.map((ref) => ({
-    id: ref.id,
-    path: ref.path,
-    size: 0,
-    contentType: ref.mimeType,
-  }));
+  // imageOrder now only tracks image slots (see chat-attachments.ts), so split
+  // offloaded refs by mime: image offloads interleave with inline images via
+  // imageOrder, and non-image offloads append to the transcript tail. Without
+  // this split a non-image file would consume the next image slot whenever
+  // both kinds appear in the same request.
+  const imageOffloadedSaved: SavedMedia[] = [];
+  const nonImageOffloadedSaved: SavedMedia[] = [];
+  for (const ref of params.offloadedRefs) {
+    const entry: SavedMedia = {
+      id: ref.id,
+      path: ref.path,
+      size: 0,
+      contentType: ref.mimeType,
+    };
+    if (ref.mimeType.startsWith("image/")) {
+      imageOffloadedSaved.push(entry);
+    } else {
+      nonImageOffloadedSaved.push(entry);
+    }
+  }
  if (params.imageOrder.length === 0) {
-    return [...inlineSaved, ...offloadedSaved];
+    return [...inlineSaved, ...imageOffloadedSaved, ...nonImageOffloadedSaved];
  }
  const saved: SavedMedia[] = [];
  let inlineIndex = 0;
@@ -737,7 +751,7 @@ async function persistChatSendImages(params: {
      }
      continue;
    }
-    const offloaded = offloadedSaved[offloadedIndex++];
+    const offloaded = imageOffloadedSaved[offloadedIndex++];
    if (offloaded) {
      saved.push(offloaded);
    }
@@ -748,12 +762,15 @@ async function persistChatSendImages(params: {
      saved.push(inline);
    }
  }
-  for (; offloadedIndex < offloadedSaved.length; offloadedIndex++) {
-    const offloaded = offloadedSaved[offloadedIndex];
+  for (; offloadedIndex < imageOffloadedSaved.length; offloadedIndex++) {
+    const offloaded = imageOffloadedSaved[offloadedIndex];
    if (offloaded) {
      saved.push(offloaded);
    }
  }
+  for (const offloaded of nonImageOffloadedSaved) {
+    saved.push(offloaded);
+  }
  return saved;
 }

@@ -816,7 +833,7 @@ async function prestageNonImageOffloads(params: {
      MediaType: nonImage[0].mimeType,
      MediaTypes: nonImage.map((ref) => ref.mimeType),
    };
-    await stageSandboxMedia({
+    const stageResult = await stageSandboxMedia({
      ctx: stagingCtx,
      sessionCtx: stagingCtx as TemplateContext,
      cfg: params.cfg,
@@ -824,18 +841,26 @@ async function prestageNonImageOffloads(params: {
      workspaceDir,
    });

-    const stagedPaths = stagingCtx.MediaPaths ?? [];
-    const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);
-    if (stagedPaths.length !== nonImage.length) {
+    // stageSandboxMedia silently keeps unstaged entries as their original
+    // absolute path, so length parity with `nonImage` does not prove every
+    // file landed in the sandbox. The RPC max (20MB via
+    // resolveChatAttachmentMaxBytes) admits files above the staging cap
+    // (STAGED_MEDIA_MAX_BYTES = 5MB); check the returned `staged` map so any
+    // missing source becomes a 5xx MediaOffloadError the client can retry.
+    const stagedSources = stageResult.staged;
+    const missing = nonImage.filter((ref) => !stagedSources.has(ref.path));
+    if (missing.length > 0) {
      throw new Error(
-        `non-image attachment staging incomplete: ${stagedPaths.length}/${nonImage.length} paths staged into sandbox workspace`,
+        `non-image attachment staging incomplete: ${stagedSources.size}/${nonImage.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`,
      );
    }
+    const stagedPaths = stagingCtx.MediaPaths ?? [];
+    const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);

-    const absolutePaths = stagedPaths.map((p) =>
-      path.isAbsolute(p) ? p : path.join(sandbox.workspaceDir, p),
-    );
-    return { paths: absolutePaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir };
+    // Keep stagedPaths sandbox-relative (e.g. `media/inbound/foo.pdf`) so the
+    // agent inside the container can read them. Host-side media-understanding
+    // resolves them via ctx.MediaWorkspaceDir, which we carry separately.
+    return { paths: stagedPaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir };
  } catch (err) {
    await Promise.allSettled(
      params.offloadedRefs.map((ref) => deleteMediaBuffer(ref.id, "inbound")),
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -536,6 +536,7 @@ export async function applyMediaUnderstanding(params: {
  const cache = createMediaAttachmentCache(attachments, {
    localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
    ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
+    workspaceDir: ctx.MediaWorkspaceDir,
  });

  try {
--- a/src/media-understanding/attachments.cache.ts
+++ b/src/media-understanding/attachments.cache.ts
@@ -53,6 +53,7 @@ export type MediaAttachmentCacheOptions = {
  localPathRoots?: readonly string[];
  includeDefaultLocalPathRoots?: boolean;
  ssrfPolicy?: SsrFPolicy;
+  workspaceDir?: string;
 };

 function resolveRequestUrl(input: RequestInfo | URL): string {
@@ -70,6 +71,7 @@ export class MediaAttachmentCache {
  private readonly attachments: MediaAttachment[];
  private readonly localPathRoots: readonly string[];
  private readonly ssrfPolicy: SsrFPolicy | undefined;
+  private readonly workspaceDir?: string;
  private canonicalLocalPathRoots?: Promise<readonly string[]>;

  constructor(attachments: MediaAttachment[], options?: MediaAttachmentCacheOptions) {
@@ -79,6 +81,7 @@ export class MediaAttachmentCache {
      options?.includeDefaultLocalPathRoots === false
        ? mergeInboundPathRoots(options.localPathRoots)
        : mergeInboundPathRoots(options?.localPathRoots, getDefaultLocalPathRoots());
+    this.workspaceDir = options?.workspaceDir ? path.resolve(options.workspaceDir) : undefined;
    for (const attachment of attachments) {
      this.entries.set(attachment.index, { attachment });
    }
@@ -293,7 +296,7 @@ export class MediaAttachmentCache {
    if (!rawPath) {
      return undefined;
    }
-    return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
+    return this.workspaceDir ? path.resolve(this.workspaceDir, rawPath) : path.resolve(rawPath);
  }

  private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
--- a/src/media-understanding/media-understanding-misc.test.ts
+++ b/src/media-understanding/media-understanding-misc.test.ts
@@ -97,6 +97,23 @@ describe("media understanding attachments SSRF", () => {
    });
  });

+  it("resolves relative attachment paths against the provided workspaceDir", async () => {
+    await withTempDir({ prefix: "openclaw-media-cache-workspace-" }, async (base) => {
+      const workspaceDir = path.join(base, "workspace");
+      const attachmentPath = path.join(workspaceDir, "media", "inbound", "report.pdf");
+      await fs.mkdir(path.dirname(attachmentPath), { recursive: true });
+      await fs.writeFile(attachmentPath, "ok");
+
+      const cache = new MediaAttachmentCache([{ index: 0, path: "media/inbound/report.pdf" }], {
+        localPathRoots: [workspaceDir],
+        workspaceDir,
+      });
+
+      const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 });
+      expect(result.buffer.toString()).toBe("ok");
+    });
+  });
+
  it("blocks local attachments outside configured roots", async () => {
    if (process.platform === "win32") {
      return;
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -270,8 +270,15 @@ export function resolveMediaAttachmentLocalRoots(params: {
  cfg: OpenClawConfig;
  ctx: MsgContext;
 }): readonly string[] {
+  // ctx.MediaWorkspaceDir is set by chat.send's prestageNonImageOffloads when
+  // inbound attachments were staged into a sandbox workspace. The paths in
+  // ctx.MediaPaths are kept sandbox-relative (so the agent inside the
+  // container can read them), and the workspace dir is carried separately so
+  // host-side media-understanding can still resolve them via this root list.
+  const workspaceDir = params.ctx.MediaWorkspaceDir;
  return mergeInboundPathRoots(
    getDefaultMediaLocalRoots(),
+    workspaceDir ? [path.resolve(workspaceDir)] : undefined,
    resolveChannelInboundAttachmentRoots(params),
  );
 }