fix(gateway): keep sandbox-relative MediaPaths and pass workspace context

Root cause: ctx.MediaPaths was overloaded with two incompatible meanings —
sandbox-relative for the agent runtime, host-absolute for host-side
media-understanding. The previous "absolutize in chat.send + set
MediaStaged=true" path made media-understanding work but shipped an
unreadable host path to the agent inside the sandbox.

- Keep ctx.MediaPaths sandbox-relative after prestage; carry a separate
  ctx.MediaWorkspaceDir so host-side media-understanding can still resolve
  the staged files via localPathRoots / attachment cache.
- stageSandboxMedia returns an authoritative {source -> relpath} map so
  prestageNonImageOffloads detects partial staging failures (files admitted
  by the 20MB RPC cap but rejected by the 5MB staging cap) and surfaces
  them as 5xx MediaOffloadError UNAVAILABLE.
- Reject images above MAX_IMAGE_BYTES at parse time: the agent-side
  hydration path drops them silently otherwise, producing a successful
  response with a missing image.
- Scope imageOrder to image offloads only and split persistChatSendImages
  offloaded refs by mime so non-image files append to the transcript tail
  instead of consuming image slots in mixed batches.

Signed-off-by: samzong <samzong.lu@gmail.com>
This commit is contained in:
samzong
2026-04-16 22:53:32 +08:00
committed by Frank Yang
parent bb259ac039
commit 72d0625251
9 changed files with 236 additions and 35 deletions

View File

@@ -19,18 +19,31 @@ import type { MsgContext, TemplateContext } from "../templating.js";
const STAGED_MEDIA_MAX_BYTES = MEDIA_MAX_BYTES;
// `staged` maps every absolute source path that was copied into the sandbox
// (or remote cache) to its rewritten ctx path. Callers like chat.send's
// prestage use this to detect partial failures: unstaged sources keep their
// original absolute path in ctx.MediaPaths, so a length check against the
// input cannot distinguish "everything staged" from "silently skipped some"
// (e.g. the 5MB cap in STAGED_MEDIA_MAX_BYTES rejecting files that the
// chat.send RPC already admitted under its 20MB cap).
export type StageSandboxMediaResult = {
staged: ReadonlyMap<string, string>;
};
const EMPTY_STAGE_RESULT: StageSandboxMediaResult = { staged: new Map() };
export async function stageSandboxMedia(params: {
ctx: MsgContext;
sessionCtx: TemplateContext;
cfg: OpenClawConfig;
sessionKey?: string;
workspaceDir: string;
}) {
}): Promise<StageSandboxMediaResult> {
const { ctx, sessionCtx, cfg, sessionKey, workspaceDir } = params;
const hasPathsArray = Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0;
const rawPaths = resolveRawPaths(ctx);
if (rawPaths.length === 0 || !sessionKey) {
return;
return EMPTY_STAGE_RESULT;
}
const sandbox = await ensureSandboxWorkspaceForSession({
@@ -45,7 +58,7 @@ export async function stageSandboxMedia(params: {
: null;
const effectiveWorkspaceDir = sandbox?.workspaceDir ?? remoteMediaCacheDir;
if (!effectiveWorkspaceDir) {
return;
return EMPTY_STAGE_RESULT;
}
await fs.mkdir(effectiveWorkspaceDir, { recursive: true });
@@ -116,6 +129,8 @@ export async function stageSandboxMedia(params: {
staged,
hasPathsArray,
});
return { staged };
}
async function stageLocalFileIntoRoot(params: {

View File

@@ -20,6 +20,7 @@ vi.mock("../media/store.js", async (importOriginal) => {
});
import type { OpenClawConfig } from "../config/types.openclaw.js";
import { MAX_IMAGE_BYTES } from "../media/constants.js";
import {
buildMessageWithAttachments,
type ChatAttachment,
@@ -192,7 +193,56 @@ describe("parseMessageWithAttachments", () => {
expect(parsed.images[0]?.mimeType).toBe("image/png");
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf");
expect(parsed.imageOrder).toEqual(["inline"]);
});
it("excludes non-image offloads from imageOrder in mixed batches", async () => {
// Regression: a prior revision pushed "offloaded" for every offload,
// including non-image files. In a [non-image, inline, offloaded-image]
// batch that produced imageOrder=["offloaded","inline","offloaded"] even
// though only one `[media attached: media://...]` line is ever appended
// to the prompt (for the image offload). extractTrailingAttachmentMediaUris
// then read count=2 against one trailing URI, and
// mergePromptAttachmentImages placed the single offloaded image into the
// first "offloaded" slot — swapping it ahead of the inline image.
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
const bigPng = Buffer.alloc(2_100_000);
bigPng.set([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], 0);
const { parsed } = await parseWithWarnings("x", [
{ type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf },
{ type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 },
{
type: "image",
mimeType: "image/png",
fileName: "big.png",
content: bigPng.toString("base64"),
},
]);
expect(parsed.images).toHaveLength(1);
expect(parsed.offloadedRefs.map((ref) => ref.mimeType)).toEqual([
"application/pdf",
"image/png",
]);
expect(parsed.imageOrder).toEqual(["inline", "offloaded"]);
// The offloaded-image URI is the sole trailing media:// line, matching
// imageOrder's single "offloaded" slot.
const trailingMediaLines = parsed.message
.split("\n")
.filter((line) => line.trim().startsWith("[media attached: media://inbound/"));
expect(trailingMediaLines).toHaveLength(1);
});
it("rejects oversized images before offload", async () => {
const big = Buffer.alloc(MAX_IMAGE_BYTES + 1, 1).toString("base64");
await expect(
parseMessageWithAttachments(
"x",
[{ type: "image", mimeType: "image/png", fileName: "huge.png", content: big }],
{ maxBytes: resolveChatAttachmentMaxBytes({} as OpenClawConfig), log: { warn: () => {} } },
),
).rejects.toThrow(/image exceeds size limit/i);
expect(saveMediaBufferMock).not.toHaveBeenCalled();
});
it("preserves specific OOXML mime when sniff returns generic zip (docx)", async () => {

View File

@@ -296,6 +296,13 @@ export async function parseMessageWithAttachments(
`attachment ${label}: non-image attachments (${finalMime}) are not supported on this entrypoint`,
);
}
// Agent-side hydration (loadImageFromRef via optimizeAndClampImage / GIF
// direct compare) caps at MAX_IMAGE_BYTES. Accepting images above that
// would offload a file the runner later drops to null — a successful
// response with a silently missing image. Reject here so the client
// sees an explicit 4xx. Non-image attachments keep the full maxBytes
// ceiling because their host path (ctx.MediaPaths → Read/Bash) doesn't
// load into the model.
if (isImage && sizeBytes > MAX_IMAGE_BYTES) {
throw new Error(
`attachment ${label}: image exceeds size limit (${sizeBytes} > ${MAX_IMAGE_BYTES} bytes)`,

View File

@@ -69,6 +69,11 @@ const mockState = vi.hoisted(() => ({
sandboxWorkspace: null as { workspaceDir: string; containerWorkdir?: string } | null,
stageSandboxMediaError: null as Error | null,
stagedRelativePaths: null as string[] | null,
// `unstagedSources` lets tests simulate partial staging failure: absolute
// source paths listed here are excluded from the returned `staged` map even
// though ctx still carries their rewritten paths. This mirrors how the real
// stageSandboxMedia silently skips over-cap files.
unstagedSources: null as string[] | null,
deleteMediaBufferCalls: [] as Array<{ id: string; subdir?: string }>,
}));
@@ -236,10 +241,26 @@ vi.mock("../../auto-reply/reply/stage-sandbox-media.js", () => ({
if (mockState.stageSandboxMediaError) {
throw mockState.stageSandboxMediaError;
}
const staged = new Map<string, string>();
const originalPaths = params.ctx.MediaPaths ?? [];
if (mockState.stagedRelativePaths) {
params.ctx.MediaPaths = [...mockState.stagedRelativePaths];
params.ctx.MediaPath = mockState.stagedRelativePaths[0];
const mapping = mockState.stagedRelativePaths;
params.ctx.MediaPaths = [...mapping];
params.ctx.MediaPath = mapping[0];
for (let i = 0; i < mapping.length; i += 1) {
const source = originalPaths[i];
const dest = mapping[i];
if (source && dest) {
staged.set(source, dest);
}
}
}
if (mockState.unstagedSources) {
for (const source of mockState.unstagedSources) {
staged.delete(source);
}
}
return { staged };
},
),
}));
@@ -499,6 +520,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
mockState.sandboxWorkspace = null;
mockState.stageSandboxMediaError = null;
mockState.stagedRelativePaths = null;
mockState.unstagedSources = null;
mockState.deleteMediaBufferCalls = [];
});
@@ -2557,7 +2579,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
});
it("absolutizes sandbox-relative MediaPaths so media-understanding can resolve them", async () => {
it("preserves sandbox-relative MediaPaths and stores workspace context for media-understanding", async () => {
createTranscriptFixture("openclaw-chat-send-non-image-absolutize-");
mockState.finalText = "ok";
mockState.sessionEntry = {
@@ -2599,15 +2621,9 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expectBroadcast: false,
});
// applyMediaUnderstanding resolves `path.isAbsolute(raw) ? raw : path.resolve(raw)`
// against the gateway CWD. Relative paths would miss the staged copy, so
// prestage rejoins the sandbox workspaceDir to keep the path absolute.
expect(mockState.lastDispatchCtx?.MediaPaths).toEqual([
"/sandbox/workspace/media/inbound/report.pdf",
]);
expect(mockState.lastDispatchCtx?.MediaPath).toBe(
"/sandbox/workspace/media/inbound/report.pdf",
);
expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["media/inbound/report.pdf"]);
expect(mockState.lastDispatchCtx?.MediaPath).toBe("media/inbound/report.pdf");
expect(mockState.lastDispatchCtx?.MediaWorkspaceDir).toBe("/sandbox/workspace");
expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
});
@@ -2670,6 +2686,66 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]);
});
it("surfaces partial non-image staging failures as 5xx UNAVAILABLE", async () => {
// Regression: stageSandboxMedia keeps unstaged entries as their original
// absolute path, so a simple `stagedPaths.length === nonImage.length`
// check could not detect when one of the files silently fell out (e.g. a
// file between the RPC cap and the staging cap). Prestage must compare
// the returned `staged` map against the input refs.
createTranscriptFixture("openclaw-chat-send-partial-stage-");
mockState.finalText = "ok";
mockState.sessionEntry = {
modelProvider: "test-provider",
model: "vision-model",
};
mockState.modelCatalog = [
{
provider: "test-provider",
id: "vision-model",
name: "Vision model",
input: ["text", "image"],
},
];
mockState.savedMediaResults = [
{ path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
{ path: "/home/user/.openclaw/media/inbound/oversize.pdf", contentType: "application/pdf" },
];
mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
mockState.stagedRelativePaths = ["media/inbound/report.pdf", "media/inbound/oversize.pdf"];
mockState.unstagedSources = ["/home/user/.openclaw/media/inbound/oversize.pdf"];
const respond = vi.fn();
const context = createChatContext();
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
await runNonStreamingChatSend({
context,
respond,
idempotencyKey: "idem-partial-stage",
message: "read these",
requestParams: {
attachments: [
{ type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf },
{ type: "file", mimeType: "application/pdf", fileName: "oversize.pdf", content: pdf },
],
},
expectBroadcast: false,
waitFor: "none",
});
expect(mockState.lastDispatchCtx).toBeUndefined();
expect(respond).toHaveBeenCalledTimes(1);
const [ok, payload, error] = respond.mock.calls[0] ?? [];
expect(ok).toBe(false);
expect(payload).toBeUndefined();
expect(error?.code).toBe(ErrorCodes.UNAVAILABLE);
expect(error?.message ?? String(error)).toMatch(/staging incomplete/i);
// Both media-store entries are cleaned up before the 5xx surfaces.
expect(mockState.deleteMediaBufferCalls.map((c) => c.id).toSorted()).toEqual([
"saved-media",
"saved-media",
]);
});
it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {
createTranscriptFixture("openclaw-chat-send-image-order-");
mockState.finalText = "ok";

View File

@@ -717,14 +717,28 @@ async function persistChatSendImages(params: {
);
}
}
const offloadedSaved = params.offloadedRefs.map((ref) => ({
id: ref.id,
path: ref.path,
size: 0,
contentType: ref.mimeType,
}));
// imageOrder now only tracks image slots (see chat-attachments.ts), so split
// offloaded refs by mime: image offloads interleave with inline images via
// imageOrder, and non-image offloads append to the transcript tail. Without
// this split a non-image file would consume the next image slot whenever
// both kinds appear in the same request.
const imageOffloadedSaved: SavedMedia[] = [];
const nonImageOffloadedSaved: SavedMedia[] = [];
for (const ref of params.offloadedRefs) {
const entry: SavedMedia = {
id: ref.id,
path: ref.path,
size: 0,
contentType: ref.mimeType,
};
if (ref.mimeType.startsWith("image/")) {
imageOffloadedSaved.push(entry);
} else {
nonImageOffloadedSaved.push(entry);
}
}
if (params.imageOrder.length === 0) {
return [...inlineSaved, ...offloadedSaved];
return [...inlineSaved, ...imageOffloadedSaved, ...nonImageOffloadedSaved];
}
const saved: SavedMedia[] = [];
let inlineIndex = 0;
@@ -737,7 +751,7 @@ async function persistChatSendImages(params: {
}
continue;
}
const offloaded = offloadedSaved[offloadedIndex++];
const offloaded = imageOffloadedSaved[offloadedIndex++];
if (offloaded) {
saved.push(offloaded);
}
@@ -748,12 +762,15 @@ async function persistChatSendImages(params: {
saved.push(inline);
}
}
for (; offloadedIndex < offloadedSaved.length; offloadedIndex++) {
const offloaded = offloadedSaved[offloadedIndex];
for (; offloadedIndex < imageOffloadedSaved.length; offloadedIndex++) {
const offloaded = imageOffloadedSaved[offloadedIndex];
if (offloaded) {
saved.push(offloaded);
}
}
for (const offloaded of nonImageOffloadedSaved) {
saved.push(offloaded);
}
return saved;
}
@@ -816,7 +833,7 @@ async function prestageNonImageOffloads(params: {
MediaType: nonImage[0].mimeType,
MediaTypes: nonImage.map((ref) => ref.mimeType),
};
await stageSandboxMedia({
const stageResult = await stageSandboxMedia({
ctx: stagingCtx,
sessionCtx: stagingCtx as TemplateContext,
cfg: params.cfg,
@@ -824,18 +841,26 @@ async function prestageNonImageOffloads(params: {
workspaceDir,
});
const stagedPaths = stagingCtx.MediaPaths ?? [];
const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);
if (stagedPaths.length !== nonImage.length) {
// stageSandboxMedia silently keeps unstaged entries as their original
// absolute path, so length parity with `nonImage` does not prove every
// file landed in the sandbox. The RPC max (20MB via
// resolveChatAttachmentMaxBytes) admits files above the staging cap
// (STAGED_MEDIA_MAX_BYTES = 5MB); check the returned `staged` map so any
// missing source becomes a 5xx MediaOffloadError the client can retry.
const stagedSources = stageResult.staged;
const missing = nonImage.filter((ref) => !stagedSources.has(ref.path));
if (missing.length > 0) {
throw new Error(
`non-image attachment staging incomplete: ${stagedPaths.length}/${nonImage.length} paths staged into sandbox workspace`,
`non-image attachment staging incomplete: ${stagedSources.size}/${nonImage.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`,
);
}
const stagedPaths = stagingCtx.MediaPaths ?? [];
const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);
const absolutePaths = stagedPaths.map((p) =>
path.isAbsolute(p) ? p : path.join(sandbox.workspaceDir, p),
);
return { paths: absolutePaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir };
// Keep stagedPaths sandbox-relative (e.g. `media/inbound/foo.pdf`) so the
// agent inside the container can read them. Host-side media-understanding
// resolves them via ctx.MediaWorkspaceDir, which we carry separately.
return { paths: stagedPaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir };
} catch (err) {
await Promise.allSettled(
params.offloadedRefs.map((ref) => deleteMediaBuffer(ref.id, "inbound")),

View File

@@ -536,6 +536,7 @@ export async function applyMediaUnderstanding(params: {
const cache = createMediaAttachmentCache(attachments, {
localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
workspaceDir: ctx.MediaWorkspaceDir,
});
try {

View File

@@ -53,6 +53,7 @@ export type MediaAttachmentCacheOptions = {
localPathRoots?: readonly string[];
includeDefaultLocalPathRoots?: boolean;
ssrfPolicy?: SsrFPolicy;
workspaceDir?: string;
};
function resolveRequestUrl(input: RequestInfo | URL): string {
@@ -70,6 +71,7 @@ export class MediaAttachmentCache {
private readonly attachments: MediaAttachment[];
private readonly localPathRoots: readonly string[];
private readonly ssrfPolicy: SsrFPolicy | undefined;
private readonly workspaceDir?: string;
private canonicalLocalPathRoots?: Promise<readonly string[]>;
constructor(attachments: MediaAttachment[], options?: MediaAttachmentCacheOptions) {
@@ -79,6 +81,7 @@ export class MediaAttachmentCache {
options?.includeDefaultLocalPathRoots === false
? mergeInboundPathRoots(options.localPathRoots)
: mergeInboundPathRoots(options?.localPathRoots, getDefaultLocalPathRoots());
this.workspaceDir = options?.workspaceDir ? path.resolve(options.workspaceDir) : undefined;
for (const attachment of attachments) {
this.entries.set(attachment.index, { attachment });
}
@@ -293,7 +296,7 @@ export class MediaAttachmentCache {
if (!rawPath) {
return undefined;
}
return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
return this.workspaceDir ? path.resolve(this.workspaceDir, rawPath) : path.resolve(rawPath);
}
private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {

View File

@@ -97,6 +97,23 @@ describe("media understanding attachments SSRF", () => {
});
});
it("resolves relative attachment paths against the provided workspaceDir", async () => {
await withTempDir({ prefix: "openclaw-media-cache-workspace-" }, async (base) => {
const workspaceDir = path.join(base, "workspace");
const attachmentPath = path.join(workspaceDir, "media", "inbound", "report.pdf");
await fs.mkdir(path.dirname(attachmentPath), { recursive: true });
await fs.writeFile(attachmentPath, "ok");
const cache = new MediaAttachmentCache([{ index: 0, path: "media/inbound/report.pdf" }], {
localPathRoots: [workspaceDir],
workspaceDir,
});
const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 });
expect(result.buffer.toString()).toBe("ok");
});
});
it("blocks local attachments outside configured roots", async () => {
if (process.platform === "win32") {
return;

View File

@@ -270,8 +270,15 @@ export function resolveMediaAttachmentLocalRoots(params: {
cfg: OpenClawConfig;
ctx: MsgContext;
}): readonly string[] {
// ctx.MediaWorkspaceDir is set by chat.send's prestageNonImageOffloads when
// inbound attachments were staged into a sandbox workspace. The paths in
// ctx.MediaPaths are kept sandbox-relative (so the agent inside the
// container can read them), and the workspace dir is carried separately so
// host-side media-understanding can still resolve them via this root list.
const workspaceDir = params.ctx.MediaWorkspaceDir;
return mergeInboundPathRoots(
getDefaultMediaLocalRoots(),
workspaceDir ? [path.resolve(workspaceDir)] : undefined,
resolveChannelInboundAttachmentRoots(params),
);
}