From 72d062525103a04839969038b345caaa6a387aca Mon Sep 17 00:00:00 2001 From: samzong Date: Thu, 16 Apr 2026 22:53:32 +0800 Subject: [PATCH] fix(gateway): keep sandbox-relative MediaPaths and pass workspace context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: ctx.MediaPaths was overloaded with two incompatible meanings — sandbox-relative for the agent runtime, host-absolute for host-side media-understanding. The previous "absolutize in chat.send + set MediaStaged=true" path made media-understanding work but shipped an unreadable host path to the agent inside the sandbox. - Keep ctx.MediaPaths sandbox-relative after prestage; carry a separate ctx.MediaWorkspaceDir so host-side media-understanding can still resolve the staged files via localPathRoots / attachment cache. - stageSandboxMedia returns an authoritative {source -> relpath} map so prestageNonImageOffloads detects partial staging failures (files admitted by the 20MB RPC cap but rejected by the 5MB staging cap) and surfaces them as 5xx MediaOffloadError UNAVAILABLE. - Reject images above MAX_IMAGE_BYTES at parse time: the agent-side hydration path drops them silently otherwise, producing a successful response with a missing image. - Scope imageOrder to image offloads only and split persistChatSendImages offloaded refs by mime so non-image files append to the transcript tail instead of consuming image slots in mixed batches. Signed-off-by: samzong --- src/auto-reply/reply/stage-sandbox-media.ts | 21 +++- src/gateway/chat-attachments.test.ts | 50 +++++++++ src/gateway/chat-attachments.ts | 7 ++ .../chat.directive-tags.test.ts | 100 +++++++++++++++--- src/gateway/server-methods/chat.ts | 63 +++++++---- src/media-understanding/apply.ts | 1 + src/media-understanding/attachments.cache.ts | 5 +- .../media-understanding-misc.test.ts | 17 +++ src/media-understanding/runner.ts | 7 ++ 9 files changed, 236 insertions(+), 35 deletions(-) diff --git a/src/auto-reply/reply/stage-sandbox-media.ts b/src/auto-reply/reply/stage-sandbox-media.ts index 1689fdbbb6d..d10a92c80e2 100644 --- a/src/auto-reply/reply/stage-sandbox-media.ts +++ b/src/auto-reply/reply/stage-sandbox-media.ts @@ -19,18 +19,31 @@ import type { MsgContext, TemplateContext } from "../templating.js"; const STAGED_MEDIA_MAX_BYTES = MEDIA_MAX_BYTES; +// `staged` maps every absolute source path that was copied into the sandbox +// (or remote cache) to its rewritten ctx path. Callers like chat.send's +// prestage use this to detect partial failures: unstaged sources keep their +// original absolute path in ctx.MediaPaths, so a length check against the +// input cannot distinguish "everything staged" from "silently skipped some" +// (e.g. the 5MB cap in STAGED_MEDIA_MAX_BYTES rejecting files that the +// chat.send RPC already admitted under its 20MB cap). +export type StageSandboxMediaResult = { + staged: ReadonlyMap; +}; + +const EMPTY_STAGE_RESULT: StageSandboxMediaResult = { staged: new Map() }; + export async function stageSandboxMedia(params: { ctx: MsgContext; sessionCtx: TemplateContext; cfg: OpenClawConfig; sessionKey?: string; workspaceDir: string; -}) { +}): Promise { const { ctx, sessionCtx, cfg, sessionKey, workspaceDir } = params; const hasPathsArray = Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0; const rawPaths = resolveRawPaths(ctx); if (rawPaths.length === 0 || !sessionKey) { - return; + return EMPTY_STAGE_RESULT; } const sandbox = await ensureSandboxWorkspaceForSession({ @@ -45,7 +58,7 @@ export async function stageSandboxMedia(params: { : null; const effectiveWorkspaceDir = sandbox?.workspaceDir ?? remoteMediaCacheDir; if (!effectiveWorkspaceDir) { - return; + return EMPTY_STAGE_RESULT; } await fs.mkdir(effectiveWorkspaceDir, { recursive: true }); @@ -116,6 +129,8 @@ export async function stageSandboxMedia(params: { staged, hasPathsArray, }); + + return { staged }; } async function stageLocalFileIntoRoot(params: { diff --git a/src/gateway/chat-attachments.test.ts b/src/gateway/chat-attachments.test.ts index 7f30da554fa..78deb4c670f 100644 --- a/src/gateway/chat-attachments.test.ts +++ b/src/gateway/chat-attachments.test.ts @@ -20,6 +20,7 @@ vi.mock("../media/store.js", async (importOriginal) => { }); import type { OpenClawConfig } from "../config/types.openclaw.js"; +import { MAX_IMAGE_BYTES } from "../media/constants.js"; import { buildMessageWithAttachments, type ChatAttachment, @@ -192,7 +193,56 @@ describe("parseMessageWithAttachments", () => { expect(parsed.images[0]?.mimeType).toBe("image/png"); expect(parsed.offloadedRefs).toHaveLength(1); expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf"); + expect(parsed.imageOrder).toEqual(["inline"]); + }); + + it("excludes non-image offloads from imageOrder in mixed batches", async () => { + // Regression: a prior revision pushed "offloaded" for every offload, + // including non-image files. In a [non-image, inline, offloaded-image] + // batch that produced imageOrder=["offloaded","inline","offloaded"] even + // though only one `[media attached: media://...]` line is ever appended + // to the prompt (for the image offload). extractTrailingAttachmentMediaUris + // then read count=2 against one trailing URI, and + // mergePromptAttachmentImages placed the single offloaded image into the + // first "offloaded" slot — swapping it ahead of the inline image. + const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); + const bigPng = Buffer.alloc(2_100_000); + bigPng.set([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], 0); + const { parsed } = await parseWithWarnings("x", [ + { type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf }, + { type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 }, + { + type: "image", + mimeType: "image/png", + fileName: "big.png", + content: bigPng.toString("base64"), + }, + ]); + expect(parsed.images).toHaveLength(1); + expect(parsed.offloadedRefs.map((ref) => ref.mimeType)).toEqual([ + "application/pdf", + "image/png", + ]); expect(parsed.imageOrder).toEqual(["inline", "offloaded"]); + // The offloaded-image URI is the sole trailing media:// line, matching + // imageOrder's single "offloaded" slot. + const trailingMediaLines = parsed.message + .split("\n") + .filter((line) => line.trim().startsWith("[media attached: media://inbound/")); + expect(trailingMediaLines).toHaveLength(1); + }); + + it("rejects oversized images before offload", async () => { + const big = Buffer.alloc(MAX_IMAGE_BYTES + 1, 1).toString("base64"); + + await expect( + parseMessageWithAttachments( + "x", + [{ type: "image", mimeType: "image/png", fileName: "huge.png", content: big }], + { maxBytes: resolveChatAttachmentMaxBytes({} as OpenClawConfig), log: { warn: () => {} } }, + ), + ).rejects.toThrow(/image exceeds size limit/i); + expect(saveMediaBufferMock).not.toHaveBeenCalled(); }); it("preserves specific OOXML mime when sniff returns generic zip (docx)", async () => { diff --git a/src/gateway/chat-attachments.ts b/src/gateway/chat-attachments.ts index 7f61e02d5b2..89a4cca1775 100644 --- a/src/gateway/chat-attachments.ts +++ b/src/gateway/chat-attachments.ts @@ -296,6 +296,13 @@ export async function parseMessageWithAttachments( `attachment ${label}: non-image attachments (${finalMime}) are not supported on this entrypoint`, ); } + // Agent-side hydration (loadImageFromRef via optimizeAndClampImage / GIF + // direct compare) caps at MAX_IMAGE_BYTES. Accepting images above that + // would offload a file the runner later drops to null — a successful + // response with a silently missing image. Reject here so the client + // sees an explicit 4xx. Non-image attachments keep the full maxBytes + // ceiling because their host path (ctx.MediaPaths → Read/Bash) doesn't + // load into the model. if (isImage && sizeBytes > MAX_IMAGE_BYTES) { throw new Error( `attachment ${label}: image exceeds size limit (${sizeBytes} > ${MAX_IMAGE_BYTES} bytes)`, diff --git a/src/gateway/server-methods/chat.directive-tags.test.ts b/src/gateway/server-methods/chat.directive-tags.test.ts index fb1f6f0a302..d32f8cb7b40 100644 --- a/src/gateway/server-methods/chat.directive-tags.test.ts +++ b/src/gateway/server-methods/chat.directive-tags.test.ts @@ -69,6 +69,11 @@ const mockState = vi.hoisted(() => ({ sandboxWorkspace: null as { workspaceDir: string; containerWorkdir?: string } | null, stageSandboxMediaError: null as Error | null, stagedRelativePaths: null as string[] | null, + // `unstagedSources` lets tests simulate partial staging failure: absolute + // source paths listed here are excluded from the returned `staged` map even + // though ctx still carries their rewritten paths. This mirrors how the real + // stageSandboxMedia silently skips over-cap files. + unstagedSources: null as string[] | null, deleteMediaBufferCalls: [] as Array<{ id: string; subdir?: string }>, })); @@ -236,10 +241,26 @@ vi.mock("../../auto-reply/reply/stage-sandbox-media.js", () => ({ if (mockState.stageSandboxMediaError) { throw mockState.stageSandboxMediaError; } + const staged = new Map(); + const originalPaths = params.ctx.MediaPaths ?? []; if (mockState.stagedRelativePaths) { - params.ctx.MediaPaths = [...mockState.stagedRelativePaths]; - params.ctx.MediaPath = mockState.stagedRelativePaths[0]; + const mapping = mockState.stagedRelativePaths; + params.ctx.MediaPaths = [...mapping]; + params.ctx.MediaPath = mapping[0]; + for (let i = 0; i < mapping.length; i += 1) { + const source = originalPaths[i]; + const dest = mapping[i]; + if (source && dest) { + staged.set(source, dest); + } + } } + if (mockState.unstagedSources) { + for (const source of mockState.unstagedSources) { + staged.delete(source); + } + } + return { staged }; }, ), })); @@ -499,6 +520,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => mockState.sandboxWorkspace = null; mockState.stageSandboxMediaError = null; mockState.stagedRelativePaths = null; + mockState.unstagedSources = null; mockState.deleteMediaBufferCalls = []; }); @@ -2557,7 +2579,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true); }); - it("absolutizes sandbox-relative MediaPaths so media-understanding can resolve them", async () => { + it("preserves sandbox-relative MediaPaths and stores workspace context for media-understanding", async () => { createTranscriptFixture("openclaw-chat-send-non-image-absolutize-"); mockState.finalText = "ok"; mockState.sessionEntry = { @@ -2599,15 +2621,9 @@ describe("chat directive tag stripping for non-streaming final payloads", () => expectBroadcast: false, }); - // applyMediaUnderstanding resolves `path.isAbsolute(raw) ? raw : path.resolve(raw)` - // against the gateway CWD. Relative paths would miss the staged copy, so - // prestage rejoins the sandbox workspaceDir to keep the path absolute. - expect(mockState.lastDispatchCtx?.MediaPaths).toEqual([ - "/sandbox/workspace/media/inbound/report.pdf", - ]); - expect(mockState.lastDispatchCtx?.MediaPath).toBe( - "/sandbox/workspace/media/inbound/report.pdf", - ); + expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["media/inbound/report.pdf"]); + expect(mockState.lastDispatchCtx?.MediaPath).toBe("media/inbound/report.pdf"); + expect(mockState.lastDispatchCtx?.MediaWorkspaceDir).toBe("/sandbox/workspace"); expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true); }); @@ -2670,6 +2686,66 @@ describe("chat directive tag stripping for non-streaming final payloads", () => expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]); }); + it("surfaces partial non-image staging failures as 5xx UNAVAILABLE", async () => { + // Regression: stageSandboxMedia keeps unstaged entries as their original + // absolute path, so a simple `stagedPaths.length === nonImage.length` + // check could not detect when one of the files silently fell out (e.g. a + // file between the RPC cap and the staging cap). Prestage must compare + // the returned `staged` map against the input refs. + createTranscriptFixture("openclaw-chat-send-partial-stage-"); + mockState.finalText = "ok"; + mockState.sessionEntry = { + modelProvider: "test-provider", + model: "vision-model", + }; + mockState.modelCatalog = [ + { + provider: "test-provider", + id: "vision-model", + name: "Vision model", + input: ["text", "image"], + }, + ]; + mockState.savedMediaResults = [ + { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" }, + { path: "/home/user/.openclaw/media/inbound/oversize.pdf", contentType: "application/pdf" }, + ]; + mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" }; + mockState.stagedRelativePaths = ["media/inbound/report.pdf", "media/inbound/oversize.pdf"]; + mockState.unstagedSources = ["/home/user/.openclaw/media/inbound/oversize.pdf"]; + const respond = vi.fn(); + const context = createChatContext(); + const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); + + await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-partial-stage", + message: "read these", + requestParams: { + attachments: [ + { type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf }, + { type: "file", mimeType: "application/pdf", fileName: "oversize.pdf", content: pdf }, + ], + }, + expectBroadcast: false, + waitFor: "none", + }); + + expect(mockState.lastDispatchCtx).toBeUndefined(); + expect(respond).toHaveBeenCalledTimes(1); + const [ok, payload, error] = respond.mock.calls[0] ?? []; + expect(ok).toBe(false); + expect(payload).toBeUndefined(); + expect(error?.code).toBe(ErrorCodes.UNAVAILABLE); + expect(error?.message ?? String(error)).toMatch(/staging incomplete/i); + // Both media-store entries are cleaned up before the 5xx surfaces. + expect(mockState.deleteMediaBufferCalls.map((c) => c.id).toSorted()).toEqual([ + "saved-media", + "saved-media", + ]); + }); + it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => { createTranscriptFixture("openclaw-chat-send-image-order-"); mockState.finalText = "ok"; diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts index 9ef2aa4fcb2..0424460be24 100644 --- a/src/gateway/server-methods/chat.ts +++ b/src/gateway/server-methods/chat.ts @@ -717,14 +717,28 @@ async function persistChatSendImages(params: { ); } } - const offloadedSaved = params.offloadedRefs.map((ref) => ({ - id: ref.id, - path: ref.path, - size: 0, - contentType: ref.mimeType, - })); + // imageOrder now only tracks image slots (see chat-attachments.ts), so split + // offloaded refs by mime: image offloads interleave with inline images via + // imageOrder, and non-image offloads append to the transcript tail. Without + // this split a non-image file would consume the next image slot whenever + // both kinds appear in the same request. + const imageOffloadedSaved: SavedMedia[] = []; + const nonImageOffloadedSaved: SavedMedia[] = []; + for (const ref of params.offloadedRefs) { + const entry: SavedMedia = { + id: ref.id, + path: ref.path, + size: 0, + contentType: ref.mimeType, + }; + if (ref.mimeType.startsWith("image/")) { + imageOffloadedSaved.push(entry); + } else { + nonImageOffloadedSaved.push(entry); + } + } if (params.imageOrder.length === 0) { - return [...inlineSaved, ...offloadedSaved]; + return [...inlineSaved, ...imageOffloadedSaved, ...nonImageOffloadedSaved]; } const saved: SavedMedia[] = []; let inlineIndex = 0; @@ -737,7 +751,7 @@ async function persistChatSendImages(params: { } continue; } - const offloaded = offloadedSaved[offloadedIndex++]; + const offloaded = imageOffloadedSaved[offloadedIndex++]; if (offloaded) { saved.push(offloaded); } @@ -748,12 +762,15 @@ async function persistChatSendImages(params: { saved.push(inline); } } - for (; offloadedIndex < offloadedSaved.length; offloadedIndex++) { - const offloaded = offloadedSaved[offloadedIndex]; + for (; offloadedIndex < imageOffloadedSaved.length; offloadedIndex++) { + const offloaded = imageOffloadedSaved[offloadedIndex]; if (offloaded) { saved.push(offloaded); } } + for (const offloaded of nonImageOffloadedSaved) { + saved.push(offloaded); + } return saved; } @@ -816,7 +833,7 @@ async function prestageNonImageOffloads(params: { MediaType: nonImage[0].mimeType, MediaTypes: nonImage.map((ref) => ref.mimeType), }; - await stageSandboxMedia({ + const stageResult = await stageSandboxMedia({ ctx: stagingCtx, sessionCtx: stagingCtx as TemplateContext, cfg: params.cfg, @@ -824,18 +841,26 @@ async function prestageNonImageOffloads(params: { workspaceDir, }); - const stagedPaths = stagingCtx.MediaPaths ?? []; - const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType); - if (stagedPaths.length !== nonImage.length) { + // stageSandboxMedia silently keeps unstaged entries as their original + // absolute path, so length parity with `nonImage` does not prove every + // file landed in the sandbox. The RPC max (20MB via + // resolveChatAttachmentMaxBytes) admits files above the staging cap + // (STAGED_MEDIA_MAX_BYTES = 5MB); check the returned `staged` map so any + // missing source becomes a 5xx MediaOffloadError the client can retry. + const stagedSources = stageResult.staged; + const missing = nonImage.filter((ref) => !stagedSources.has(ref.path)); + if (missing.length > 0) { throw new Error( - `non-image attachment staging incomplete: ${stagedPaths.length}/${nonImage.length} paths staged into sandbox workspace`, + `non-image attachment staging incomplete: ${stagedSources.size}/${nonImage.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`, ); } + const stagedPaths = stagingCtx.MediaPaths ?? []; + const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType); - const absolutePaths = stagedPaths.map((p) => - path.isAbsolute(p) ? p : path.join(sandbox.workspaceDir, p), - ); - return { paths: absolutePaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir }; + // Keep stagedPaths sandbox-relative (e.g. `media/inbound/foo.pdf`) so the + // agent inside the container can read them. Host-side media-understanding + // resolves them via ctx.MediaWorkspaceDir, which we carry separately. + return { paths: stagedPaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir }; } catch (err) { await Promise.allSettled( params.offloadedRefs.map((ref) => deleteMediaBuffer(ref.id, "inbound")), diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 6a69fef139c..06acf6397e7 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -536,6 +536,7 @@ export async function applyMediaUnderstanding(params: { const cache = createMediaAttachmentCache(attachments, { localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }), ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy, + workspaceDir: ctx.MediaWorkspaceDir, }); try { diff --git a/src/media-understanding/attachments.cache.ts b/src/media-understanding/attachments.cache.ts index d3f18ce6e7d..194093d61d9 100644 --- a/src/media-understanding/attachments.cache.ts +++ b/src/media-understanding/attachments.cache.ts @@ -53,6 +53,7 @@ export type MediaAttachmentCacheOptions = { localPathRoots?: readonly string[]; includeDefaultLocalPathRoots?: boolean; ssrfPolicy?: SsrFPolicy; + workspaceDir?: string; }; function resolveRequestUrl(input: RequestInfo | URL): string { @@ -70,6 +71,7 @@ export class MediaAttachmentCache { private readonly attachments: MediaAttachment[]; private readonly localPathRoots: readonly string[]; private readonly ssrfPolicy: SsrFPolicy | undefined; + private readonly workspaceDir?: string; private canonicalLocalPathRoots?: Promise; constructor(attachments: MediaAttachment[], options?: MediaAttachmentCacheOptions) { @@ -79,6 +81,7 @@ export class MediaAttachmentCache { options?.includeDefaultLocalPathRoots === false ? mergeInboundPathRoots(options.localPathRoots) : mergeInboundPathRoots(options?.localPathRoots, getDefaultLocalPathRoots()); + this.workspaceDir = options?.workspaceDir ? path.resolve(options.workspaceDir) : undefined; for (const attachment of attachments) { this.entries.set(attachment.index, { attachment }); } @@ -293,7 +296,7 @@ export class MediaAttachmentCache { if (!rawPath) { return undefined; } - return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); + return this.workspaceDir ? path.resolve(this.workspaceDir, rawPath) : path.resolve(rawPath); } private async ensureLocalStat(entry: AttachmentCacheEntry): Promise { diff --git a/src/media-understanding/media-understanding-misc.test.ts b/src/media-understanding/media-understanding-misc.test.ts index dd845db6e06..c5e953f1098 100644 --- a/src/media-understanding/media-understanding-misc.test.ts +++ b/src/media-understanding/media-understanding-misc.test.ts @@ -97,6 +97,23 @@ describe("media understanding attachments SSRF", () => { }); }); + it("resolves relative attachment paths against the provided workspaceDir", async () => { + await withTempDir({ prefix: "openclaw-media-cache-workspace-" }, async (base) => { + const workspaceDir = path.join(base, "workspace"); + const attachmentPath = path.join(workspaceDir, "media", "inbound", "report.pdf"); + await fs.mkdir(path.dirname(attachmentPath), { recursive: true }); + await fs.writeFile(attachmentPath, "ok"); + + const cache = new MediaAttachmentCache([{ index: 0, path: "media/inbound/report.pdf" }], { + localPathRoots: [workspaceDir], + workspaceDir, + }); + + const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 }); + expect(result.buffer.toString()).toBe("ok"); + }); + }); + it("blocks local attachments outside configured roots", async () => { if (process.platform === "win32") { return; diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 36f1ae04868..001b810e59b 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -270,8 +270,15 @@ export function resolveMediaAttachmentLocalRoots(params: { cfg: OpenClawConfig; ctx: MsgContext; }): readonly string[] { + // ctx.MediaWorkspaceDir is set by chat.send's prestageNonImageOffloads when + // inbound attachments were staged into a sandbox workspace. The paths in + // ctx.MediaPaths are kept sandbox-relative (so the agent inside the + // container can read them), and the workspace dir is carried separately so + // host-side media-understanding can still resolve them via this root list. + const workspaceDir = params.ctx.MediaWorkspaceDir; return mergeInboundPathRoots( getDefaultMediaLocalRoots(), + workspaceDir ? [path.resolve(workspaceDir)] : undefined, resolveChannelInboundAttachmentRoots(params), ); }