diff --git a/CHANGELOG.md b/CHANGELOG.md index f321cc40991..12d3e26c1bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ Docs: https://docs.openclaw.ai ## Unreleased +### Changes + +- Gateway/chat: accept non-image attachments through `chat.send` by staging them as agent-readable media paths, while keeping unsupported RPC attachment paths explicit instead of silently dropping files. Fixes #48123. (#67572) Thanks @samzong. + ## 2026.4.27 ### Changes diff --git a/src/auto-reply/reply/get-reply.ts b/src/auto-reply/reply/get-reply.ts index ae1ab3c745a..8921ce2a048 100644 --- a/src/auto-reply/reply/get-reply.ts +++ b/src/auto-reply/reply/get-reply.ts @@ -613,7 +613,11 @@ export async function getReplyFromConfig( } } - if (!useFastTestBootstrap && sessionKey && hasInboundMedia(ctx)) { + // ctx.MediaStaged=true means the caller (e.g. chat.send RPC) already staged + // synchronously so it could surface 5xx before respond(). Skipping here keeps + // staging a single-call contract instead of relying on relative-path no-op + // semantics in stageSandboxMedia. + if (!useFastTestBootstrap && sessionKey && !ctx.MediaStaged && hasInboundMedia(ctx)) { const { stageSandboxMedia } = await loadStageSandboxMediaRuntime(); await stageSandboxMedia({ ctx, diff --git a/src/auto-reply/reply/stage-sandbox-media.ts b/src/auto-reply/reply/stage-sandbox-media.ts index 1689fdbbb6d..d10a92c80e2 100644 --- a/src/auto-reply/reply/stage-sandbox-media.ts +++ b/src/auto-reply/reply/stage-sandbox-media.ts @@ -19,18 +19,31 @@ import type { MsgContext, TemplateContext } from "../templating.js"; const STAGED_MEDIA_MAX_BYTES = MEDIA_MAX_BYTES; +// `staged` maps every absolute source path that was copied into the sandbox +// (or remote cache) to its rewritten ctx path. Callers like chat.send's +// prestage use this to detect partial failures: unstaged sources keep their +// original absolute path in ctx.MediaPaths, so a length check against the +// input cannot distinguish "everything staged" from "silently skipped some" +// (e.g. the 5MB cap in STAGED_MEDIA_MAX_BYTES rejecting files that the +// chat.send RPC already admitted under its 20MB cap). +export type StageSandboxMediaResult = { + staged: ReadonlyMap; +}; + +const EMPTY_STAGE_RESULT: StageSandboxMediaResult = { staged: new Map() }; + export async function stageSandboxMedia(params: { ctx: MsgContext; sessionCtx: TemplateContext; cfg: OpenClawConfig; sessionKey?: string; workspaceDir: string; -}) { +}): Promise { const { ctx, sessionCtx, cfg, sessionKey, workspaceDir } = params; const hasPathsArray = Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0; const rawPaths = resolveRawPaths(ctx); if (rawPaths.length === 0 || !sessionKey) { - return; + return EMPTY_STAGE_RESULT; } const sandbox = await ensureSandboxWorkspaceForSession({ @@ -45,7 +58,7 @@ export async function stageSandboxMedia(params: { : null; const effectiveWorkspaceDir = sandbox?.workspaceDir ?? remoteMediaCacheDir; if (!effectiveWorkspaceDir) { - return; + return EMPTY_STAGE_RESULT; } await fs.mkdir(effectiveWorkspaceDir, { recursive: true }); @@ -116,6 +129,8 @@ export async function stageSandboxMedia(params: { staged, hasPathsArray, }); + + return { staged }; } async function stageLocalFileIntoRoot(params: { diff --git a/src/auto-reply/templating.ts b/src/auto-reply/templating.ts index 996b3475774..1dc0fe25865 100644 --- a/src/auto-reply/templating.ts +++ b/src/auto-reply/templating.ts @@ -123,8 +123,15 @@ export type MsgContext = { MediaPaths?: string[]; MediaUrls?: string[]; MediaTypes?: string[]; + MediaWorkspaceDir?: string; /** Attachment indexes whose audio was already transcribed before media understanding runs. */ MediaTranscribedIndexes?: number[]; + /** + * Marker: skip downstream stageSandboxMedia. chat.send RPC sets this so + * staging runs synchronously before respond() and surfaces 5xx to the + * client; any later failure only reaches the broadcast channel. + */ + MediaStaged?: boolean; /** Telegram sticker metadata (emoji, set name, file IDs, cached description). */ Sticker?: StickerContextMetadata; /** True when current-turn sticker media is present in MediaPaths (false for cached-description path). */ diff --git a/src/gateway/chat-attachments.test.ts b/src/gateway/chat-attachments.test.ts index c7920754ab8..e34792a0f79 100644 --- a/src/gateway/chat-attachments.test.ts +++ b/src/gateway/chat-attachments.test.ts @@ -1,26 +1,66 @@ -import { describe, expect, it, vi } from "vitest"; -import { deleteMediaBuffer } from "../media/store.js"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +const saveMediaBufferMock = vi.hoisted(() => + vi.fn(async (_buffer: Buffer, mime?: string, _subdir?: string) => ({ + id: `fake-id-${Math.random().toString(36).slice(2, 10)}`, + path: `/tmp/openclaw-test-media/inbound/fake.${mime?.split("/")[1] ?? "bin"}`, + size: 0, + contentType: mime, + })), +); +const deleteMediaBufferMock = vi.hoisted(() => + vi.fn(async (_id: string, _subdir?: string) => undefined), +); + +vi.mock("../media/store.js", async (importOriginal) => { + const actual = (await importOriginal()) as Record; + return { + ...actual, + saveMediaBuffer: saveMediaBufferMock, + deleteMediaBuffer: deleteMediaBufferMock, + }; +}); + +import type { OpenClawConfig } from "../config/types.openclaw.js"; +import { MAX_IMAGE_BYTES } from "../media/constants.js"; import { buildMessageWithAttachments, type ChatAttachment, + DEFAULT_CHAT_ATTACHMENT_MAX_MB, parseMessageWithAttachments, + resolveChatAttachmentMaxBytes, + UnsupportedAttachmentError, } from "./chat-attachments.js"; const PNG_1x1 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII="; -async function parseWithWarnings(message: string, attachments: ChatAttachment[]) { +async function parseWithWarnings( + message: string, + attachments: ChatAttachment[], + opts: Parameters[2] = {}, +) { const logs: string[] = []; const parsed = await parseMessageWithAttachments(message, attachments, { log: { warn: (warning) => logs.push(warning) }, + ...opts, }); return { parsed, logs }; } async function cleanupOffloadedRefs(refs: { id: string }[]) { - await Promise.allSettled(refs.map((ref) => deleteMediaBuffer(ref.id, "inbound"))); + await Promise.allSettled(refs.map((ref) => deleteMediaBufferMock(ref.id, "inbound"))); } +beforeEach(() => { + saveMediaBufferMock.mockClear(); + deleteMediaBufferMock.mockClear(); +}); + +afterEach(() => { + vi.restoreAllMocks(); +}); + describe("buildMessageWithAttachments", () => { it("embeds a single image as data URL", () => { const msg = buildMessageWithAttachments("see this", [ @@ -81,19 +121,43 @@ describe("parseMessageWithAttachments", () => { expect(logs).toHaveLength(0); }); - it("drops non-image payloads and logs", async () => { - const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); - const { parsed, logs } = await parseWithWarnings("x", [ + it("accepts non-image payloads and offloads them via the media store", async () => { + const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64"); + const { parsed, logs } = await parseWithWarnings("read this", [ { type: "file", - mimeType: "image/png", - fileName: "not-image.pdf", + mimeType: "application/pdf", + fileName: "report.pdf", content: pdf, }, ]); expect(parsed.images).toHaveLength(0); - expect(logs).toHaveLength(1); - expect(logs[0]).toMatch(/non-image/i); + expect(parsed.offloadedRefs).toHaveLength(1); + const ref = parsed.offloadedRefs[0]; + expect(ref.mimeType).toBe("application/pdf"); + expect(ref.label).toBe("report.pdf"); + expect(ref.mediaRef).toMatch(/^media:\/\/inbound\//); + // Non-image offloads MUST NOT inject a media://URI into the message — + // the caller is responsible for routing offloadedRefs[].path into + // ctx.MediaPaths so the workspace stage surfaces a real path. + expect(parsed.message).toBe("read this"); + expect(saveMediaBufferMock).toHaveBeenCalledOnce(); + expect(saveMediaBufferMock.mock.calls[0]?.[1]).toBe("application/pdf"); + expect(logs).toHaveLength(0); + }); + + it("offloads opaque binary when sniff and provided mime are both absent", async () => { + const unknown = Buffer.from("just some bytes that do not match any signature").toString( + "base64", + ); + const { parsed, logs } = await parseWithWarnings("take a look", [ + { type: "file", fileName: "blob.dat", content: unknown }, + ]); + expect(parsed.offloadedRefs).toHaveLength(1); + expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/octet-stream"); + expect(saveMediaBufferMock.mock.calls[0]?.[1]).toBe("application/octet-stream"); + expect(parsed.message).toBe("take a look"); + expect(logs).toHaveLength(0); }); it("prefers sniffed mime type and logs mismatch", async () => { @@ -111,28 +175,9 @@ describe("parseMessageWithAttachments", () => { expect(logs[0]).toMatch(/mime mismatch/i); }); - it("persists unknown non-image files when sniff fails", async () => { - const unknown = Buffer.from("not an image").toString("base64"); - const { parsed, logs } = await parseWithWarnings("x", [ - { type: "file", fileName: "unknown.bin", content: unknown }, - ]); - try { - expect(parsed.images).toHaveLength(0); - expect(parsed.offloadedRefs).toHaveLength(1); - expect(parsed.offloadedRefs[0]).toMatchObject({ - label: "unknown.bin", - mimeType: "application/octet-stream", - }); - expect(parsed.message).toMatch(/^x\n\[media attached: media:\/\/inbound\//); - expect(logs).toHaveLength(0); - } finally { - await cleanupOffloadedRefs(parsed.offloadedRefs); - } - }); - - it("keeps valid images and drops invalid ones", async () => { + it("keeps image inline and offloads non-image side by side", async () => { const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); - const { parsed, logs } = await parseWithWarnings("x", [ + const { parsed } = await parseWithWarnings("x", [ { type: "image", mimeType: "image/png", @@ -141,15 +186,196 @@ describe("parseMessageWithAttachments", () => { }, { type: "file", - mimeType: "image/png", - fileName: "not-image.pdf", + mimeType: "application/pdf", + fileName: "report.pdf", content: pdf, }, ]); expect(parsed.images).toHaveLength(1); expect(parsed.images[0]?.mimeType).toBe("image/png"); - expect(parsed.images[0]?.data).toBe(PNG_1x1); - expect(logs.some((l) => /non-image/i.test(l))).toBe(true); + expect(parsed.offloadedRefs).toHaveLength(1); + expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf"); + expect(parsed.imageOrder).toEqual(["inline"]); + }); + + it("excludes non-image offloads from imageOrder in mixed batches", async () => { + // Regression: a prior revision pushed "offloaded" for every offload, + // including non-image files. In a [non-image, inline, offloaded-image] + // batch that produced imageOrder=["offloaded","inline","offloaded"] even + // though only one `[media attached: media://...]` line is ever appended + // to the prompt (for the image offload). extractTrailingAttachmentMediaUris + // then read count=2 against one trailing URI, and + // mergePromptAttachmentImages placed the single offloaded image into the + // first "offloaded" slot — swapping it ahead of the inline image. + const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); + const bigPng = Buffer.alloc(2_100_000); + bigPng.set([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], 0); + const { parsed } = await parseWithWarnings("x", [ + { type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf }, + { type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 }, + { + type: "image", + mimeType: "image/png", + fileName: "big.png", + content: bigPng.toString("base64"), + }, + ]); + expect(parsed.images).toHaveLength(1); + expect(parsed.offloadedRefs.map((ref) => ref.mimeType)).toEqual([ + "application/pdf", + "image/png", + ]); + expect(parsed.imageOrder).toEqual(["inline", "offloaded"]); + // The offloaded-image URI is the sole trailing media:// line, matching + // imageOrder's single "offloaded" slot. + const trailingMediaLines = parsed.message + .split("\n") + .filter((line) => line.trim().startsWith("[media attached: media://inbound/")); + expect(trailingMediaLines).toHaveLength(1); + }); + + it("rejects oversized images before offload", async () => { + const big = Buffer.alloc(MAX_IMAGE_BYTES + 1, 1).toString("base64"); + + await expect( + parseMessageWithAttachments( + "x", + [{ type: "image", mimeType: "image/png", fileName: "huge.png", content: big }], + { maxBytes: resolveChatAttachmentMaxBytes({} as OpenClawConfig), log: { warn: () => {} } }, + ), + ).rejects.toThrow(/image exceeds size limit/i); + expect(saveMediaBufferMock).not.toHaveBeenCalled(); + }); + + it("preserves specific OOXML mime when sniff returns generic zip (docx)", async () => { + const docx = Buffer.from("PK\u0003\u0004fake-docx-content").toString("base64"); + const { parsed } = await parseWithWarnings("x", [ + { + type: "file", + mimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + fileName: "spec.docx", + content: docx, + }, + ]); + expect(parsed.offloadedRefs).toHaveLength(1); + expect(parsed.offloadedRefs[0]?.label).toBe("spec.docx"); + // Docx sniffs as application/zip; the provided OOXML mime must win so the + // agent sees the real document type, not a generic archive. + expect(parsed.offloadedRefs[0]?.mimeType).toBe( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ); + }); + + it("recovers specific mime from filename extension when sniff is generic and provided mime is absent", async () => { + const xlsx = Buffer.from("PK\u0003\u0004fake-xlsx").toString("base64"); + const { parsed } = await parseWithWarnings("x", [ + { type: "file", fileName: "sheet.xlsx", content: xlsx }, + ]); + expect(parsed.offloadedRefs).toHaveLength(1); + expect(parsed.offloadedRefs[0]?.mimeType).toBe( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ); + }); + + it("accepts zip attachments via workspace offload", async () => { + const zip = Buffer.from("PK\u0003\u0004zip-archive-bytes").toString("base64"); + const { parsed } = await parseWithWarnings("x", [ + { + type: "file", + mimeType: "application/zip", + fileName: "bundle.zip", + content: zip, + }, + ]); + expect(parsed.offloadedRefs).toHaveLength(1); + expect(parsed.offloadedRefs[0]?.label).toBe("bundle.zip"); + expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/zip"); + }); +}); + +describe("parseMessageWithAttachments validation errors", () => { + it("throws UnsupportedAttachmentError on empty payload", async () => { + await expect( + parseMessageWithAttachments( + "x", + [{ type: "file", mimeType: "application/pdf", fileName: "empty.pdf", content: "" }], + { log: { warn: () => {} } }, + ), + ).rejects.toMatchObject({ + name: "UnsupportedAttachmentError", + reason: "empty-payload", + }); + expect(saveMediaBufferMock).not.toHaveBeenCalled(); + }); + + it("throws UnsupportedAttachmentError on non-image when acceptNonImage is false", async () => { + const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); + let caught: unknown; + try { + await parseMessageWithAttachments( + "x", + [{ type: "file", mimeType: "application/pdf", fileName: "a.pdf", content: pdf }], + { log: { warn: () => {} }, acceptNonImage: false }, + ); + } catch (err) { + caught = err; + } + expect(caught).toBeInstanceOf(UnsupportedAttachmentError); + expect((caught as UnsupportedAttachmentError).reason).toBe("unsupported-non-image"); + expect(saveMediaBufferMock).not.toHaveBeenCalled(); + }); + + it("rejects generic-container payloads mislabeled as images when acceptNonImage is false", async () => { + const docx = Buffer.from("PK\u0003\u0004fake-docx-content").toString("base64"); + let caught: unknown; + try { + await parseMessageWithAttachments( + "x", + [{ type: "file", mimeType: "image/png", fileName: "report.docx", content: docx }], + { log: { warn: () => {} }, acceptNonImage: false }, + ); + } catch (err) { + caught = err; + } + expect(caught).toBeInstanceOf(UnsupportedAttachmentError); + expect((caught as UnsupportedAttachmentError).reason).toBe("unsupported-non-image"); + expect(saveMediaBufferMock).not.toHaveBeenCalled(); + }); + + it("throws UnsupportedAttachmentError on image when supportsInlineImages is false", async () => { + let caught: unknown; + try { + await parseMessageWithAttachments( + "x", + [{ type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 }], + { log: { warn: () => {} }, supportsInlineImages: false }, + ); + } catch (err) { + caught = err; + } + expect(caught).toBeInstanceOf(UnsupportedAttachmentError); + expect((caught as UnsupportedAttachmentError).reason).toBe("text-only-image"); + expect(saveMediaBufferMock).not.toHaveBeenCalled(); + }); + + it("still offloads non-image attachments when supportsInlineImages is false", async () => { + const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); + const { parsed } = await parseWithWarnings( + "x", + [{ type: "file", mimeType: "application/pdf", fileName: "a.pdf", content: pdf }], + { supportsInlineImages: false }, + ); + expect(parsed.offloadedRefs).toHaveLength(1); + expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf"); + expect(saveMediaBufferMock).toHaveBeenCalledOnce(); + }); + + it("passes through unchanged on text-only session with no attachments", async () => { + const { parsed } = await parseWithWarnings("hello", [], { supportsInlineImages: false }); + expect(parsed.message).toBe("hello"); + expect(parsed.images).toHaveLength(0); + expect(parsed.offloadedRefs).toHaveLength(0); + expect(saveMediaBufferMock).not.toHaveBeenCalled(); }); it("persists non-image file attachments as media refs", async () => { @@ -168,13 +394,13 @@ describe("parseMessageWithAttachments", () => { try { expect(parsed.images).toHaveLength(0); - expect(parsed.imageOrder).toEqual(["offloaded"]); + expect(parsed.imageOrder).toEqual([]); expect(parsed.offloadedRefs).toHaveLength(1); expect(parsed.offloadedRefs[0]).toMatchObject({ mimeType: "application/pdf", label: "brief.pdf", }); - expect(parsed.message).toMatch(/^read this\n\[media attached: media:\/\/inbound\//); + expect(parsed.message).toBe("read this"); } finally { await cleanupOffloadedRefs(parsed.offloadedRefs); } @@ -258,6 +484,32 @@ describe("parseMessageWithAttachments", () => { }); }); +describe("resolveChatAttachmentMaxBytes", () => { + const MB = 1024 * 1024; + const DEFAULT_BYTES = DEFAULT_CHAT_ATTACHMENT_MAX_MB * MB; + + const cfgWithMediaMaxMb = (value: unknown): OpenClawConfig => + ({ agents: { defaults: { mediaMaxMb: value } } }) as unknown as OpenClawConfig; + + it("honours a configured agents.defaults.mediaMaxMb", () => { + expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(10))).toBe(10 * MB); + expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(50))).toBe(50 * MB); + }); + + it("falls back to DEFAULT_CHAT_ATTACHMENT_MAX_MB when unset", () => { + expect(resolveChatAttachmentMaxBytes({} as OpenClawConfig)).toBe(DEFAULT_BYTES); + expect(resolveChatAttachmentMaxBytes({ agents: {} } as unknown as OpenClawConfig)).toBe( + DEFAULT_BYTES, + ); + }); + + it("rejects non-positive, non-finite, or non-number values", () => { + for (const bad of [0, -5, Number.NaN, Number.POSITIVE_INFINITY, "50", null, undefined]) { + expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(bad))).toBe(DEFAULT_BYTES); + } + }); +}); + describe("shared attachment validation", () => { it("rejects invalid base64 content for both builder and parser", async () => { const bad: ChatAttachment = { diff --git a/src/gateway/chat-attachments.ts b/src/gateway/chat-attachments.ts index 6a90d1e1798..01bbe30479f 100644 --- a/src/gateway/chat-attachments.ts +++ b/src/gateway/chat-attachments.ts @@ -1,12 +1,12 @@ +import type { OpenClawConfig } from "../config/types.openclaw.js"; import { formatErrorMessage } from "../infra/errors.js"; import { estimateBase64DecodedBytes } from "../media/base64.js"; +import { MAX_IMAGE_BYTES } from "../media/constants.js"; +import { extensionForMime, mimeTypeFromFilePath } from "../media/mime.js"; import type { PromptImageOrderEntry } from "../media/prompt-image-order.js"; import { sniffMimeFromBase64 } from "../media/sniff-mime-from-base64.js"; import { deleteMediaBuffer, saveMediaBuffer } from "../media/store.js"; -import { - normalizeLowercaseStringOrEmpty, - normalizeOptionalLowercaseString, -} from "../shared/string-coerce.js"; +import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js"; export type ChatAttachment = { type?: string; @@ -21,47 +21,19 @@ export type ChatImageContent = { mimeType: string; }; -/** - * Metadata for an attachment that was offloaded to the media store. - * - * Included in ParsedMessageWithImages.offloadedRefs so that callers can - * persist structured media metadata for transcripts. Without this, consumers - * that derive MediaPath/MediaPaths from the `images` array (e.g. - * persistChatSendImages and buildChatSendTranscriptMessage in chat.ts) would - * silently omit all large attachments that were offloaded to disk. - */ export type OffloadedRef = { - /** Opaque media URI injected into the message, e.g. "media://inbound/" */ mediaRef: string; - /** The raw media ID from SavedMedia.id, usable with resolveMediaBufferPath */ id: string; - /** Absolute filesystem path returned by saveMediaBuffer — used for transcript MediaPath */ path: string; - /** MIME type of the offloaded attachment */ mimeType: string; - /** The label / filename of the original attachment */ label: string; + sizeBytes: number; }; export type ParsedMessageWithImages = { message: string; - /** Small attachments (≤ OFFLOAD_THRESHOLD_BYTES) passed inline to the model */ images: ChatImageContent[]; - /** Original accepted attachment order after inline/offloaded split. */ imageOrder: PromptImageOrderEntry[]; - /** - * Large attachments (> OFFLOAD_THRESHOLD_BYTES) that were offloaded to the - * media store. Each entry corresponds to a `[media attached: media://inbound/]` - * marker appended to `message`. - * - * Callers MUST persist this list separately for transcript media metadata. - * It is intentionally separate from `images` because downstream model calls - * do not receive these as inline image blocks. - * - * ⚠️ Call sites (chat.ts, agent.ts, server-node-events.ts) MUST also pass - * `supportsImages: modelSupportsImages(model)` so text-only model runs - * offload images as media refs instead of passing inline image blocks. - */ offloadedRefs: OffloadedRef[]; }; @@ -78,49 +50,38 @@ type NormalizedAttachment = { type SavedMedia = { id: string; - path?: string; + path: string; }; const OFFLOAD_THRESHOLD_BYTES = 2_000_000; const TEXT_ONLY_OFFLOAD_LIMIT = 10; -const MIME_TO_EXT: Record = { - "image/jpeg": ".jpg", - "image/jpg": ".jpg", - "image/png": ".png", - "image/webp": ".webp", - "image/gif": ".gif", - "image/heic": ".heic", - "image/heif": ".heif", - // bmp/tiff excluded from SUPPORTED_OFFLOAD_MIMES to avoid extension-loss - // bug in store.ts; entries kept here for future extension support - "image/bmp": ".bmp", - "image/tiff": ".tiff", -}; +export const DEFAULT_CHAT_ATTACHMENT_MAX_MB = 20; -// Module-level Set for O(1) lookup — not rebuilt on every attachment iteration. -// -// heic/heif are included only if store.ts's extensionForMime maps them to an -// extension. If it does not (same extension-loss risk as bmp/tiff), remove -// them from this set. -const SUPPORTED_OFFLOAD_MIMES = new Set([ - "image/jpeg", - "image/jpg", - "image/png", - "image/webp", - "image/gif", - "image/heic", - "image/heif", -]); +export function resolveChatAttachmentMaxBytes(cfg: OpenClawConfig): number { + const configured = cfg.agents?.defaults?.mediaMaxMb; + const mb = + typeof configured === "number" && Number.isFinite(configured) && configured > 0 + ? configured + : DEFAULT_CHAT_ATTACHMENT_MAX_MB; + return Math.floor(mb * 1024 * 1024); +} + +export type UnsupportedAttachmentReason = + | "empty-payload" + | "text-only-image" + | "unsupported-non-image" + | "non-image-too-large-for-sandbox"; + +export class UnsupportedAttachmentError extends Error { + readonly reason: UnsupportedAttachmentReason; + constructor(reason: UnsupportedAttachmentReason, message: string) { + super(message); + this.name = "UnsupportedAttachmentError"; + this.reason = reason; + } +} -/** - * Raised when the Gateway cannot persist an attachment to the media store. - * - * Distinct from ordinary input-validation errors so that Gateway handlers can - * map it to a server-side 5xx status rather than a client 4xx. - * - * Example causes: ENOSPC, EPERM, unexpected saveMediaBuffer return shape. - */ export class MediaOffloadError extends Error { readonly cause: unknown; constructor(message: string, options?: ErrorOptions) { @@ -142,42 +103,24 @@ function isImageMime(mime?: string): boolean { return typeof mime === "string" && mime.startsWith("image/"); } -function isVideoMime(mime?: string): boolean { - return typeof mime === "string" && mime.startsWith("video/"); +function isGenericContainerMime(mime?: string): boolean { + return mime === "application/zip" || mime === "application/octet-stream"; } -function isGenericMime(mime?: string): boolean { - return ( - !mime || - mime === "application/octet-stream" || - mime === "binary/octet-stream" || - mime === "application/unknown" - ); +function shouldIgnoreProvidedImageMime(params: { + sniffedMime?: string; + providedMime?: string; +}): boolean { + return isGenericContainerMime(params.sniffedMime) && isImageMime(params.providedMime); } function isValidBase64(value: string): boolean { if (value.length === 0 || value.length % 4 !== 0) { return false; } - // A full O(n) regex scan is safe: no overlapping quantifiers, fails linearly. - // Prevents adversarial payloads padded with megabytes of whitespace from - // bypassing length thresholds. return /^[A-Za-z0-9+/]+={0,2}$/.test(value); } -/** - * Confirms that the decoded buffer produced by Buffer.from(b64, 'base64') - * matches the pre-decode size estimate. - * - * Node's Buffer.from silently drops invalid base64 characters rather than - * throwing. A material size discrepancy means the source string contained - * embedded garbage that was silently stripped, which would produce a corrupted - * file on disk. ±3 bytes of leeway accounts for base64 padding rounding. - * - * IMPORTANT: this is an input-validation check (4xx client error). - * It MUST be called OUTSIDE the MediaOffloadError try/catch so that - * corrupt-input errors are not misclassified as 5xx server errors. - */ function verifyDecodedSize(buffer: Buffer, estimatedBytes: number, label: string): void { if (Math.abs(buffer.byteLength - estimatedBytes) > 3) { throw new Error( @@ -191,40 +134,34 @@ function ensureExtension(label: string, mime: string): string { if (/\.[a-zA-Z0-9]+$/.test(label)) { return label; } - const ext = MIME_TO_EXT[normalizeLowercaseStringOrEmpty(mime)] ?? ""; + const ext = extensionForMime(mime) ?? ""; return ext ? `${label}${ext}` : label; } -/** - * Type guard for the return value of saveMediaBuffer. - * - * Also validates that the returned ID: - * - is a non-empty string - * - contains no path separators (/ or \) or null bytes - * - * Catching a bad shape here produces a cleaner error than a cryptic failure - * deeper in the stack, and is treated as a 5xx infrastructure error. - */ function assertSavedMedia(value: unknown, label: string): SavedMedia { if ( - value !== null && - typeof value === "object" && - "id" in value && - typeof (value as Record).id === "string" + value === null || + typeof value !== "object" || + !("id" in value) || + typeof (value as Record).id !== "string" ) { - const id = (value as Record).id as string; - if (id.length === 0) { - throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`); - } - if (id.includes("/") || id.includes("\\") || id.includes("\0")) { - throw new Error( - `attachment ${label}: saveMediaBuffer returned an unsafe media ID ` + - `(contains path separator or null byte)`, - ); - } - return value as SavedMedia; + throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`); } - throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`); + const id = (value as Record).id as string; + if (id.length === 0) { + throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`); + } + if (id.includes("/") || id.includes("\\") || id.includes("\0")) { + throw new Error( + `attachment ${label}: saveMediaBuffer returned an unsafe media ID ` + + `(contains path separator or null byte)`, + ); + } + const path = (value as Record).path; + if (typeof path !== "string" || path.length === 0) { + throw new Error(`attachment ${label}: saveMediaBuffer returned no on-disk path`); + } + return { id, path }; } function normalizeAttachment( @@ -269,47 +206,22 @@ function validateAttachmentBase64OrThrow( return sizeBytes; } -/** - * Parse attachments and extract images as structured content blocks. - * Returns the message text, inline image blocks, and offloaded media refs. - * - * ## Offload behaviour - * Attachments whose decoded size exceeds OFFLOAD_THRESHOLD_BYTES are saved to - * disk via saveMediaBuffer and replaced with an opaque `media://inbound/` - * URI appended to the message. The agent resolves these URIs via - * resolveMediaBufferPath before passing them to the model. - * - * ## Transcript metadata - * Callers MUST use `result.offloadedRefs` to persist structured media metadata - * for transcripts. These refs are intentionally excluded from `result.images` - * because they are not passed inline to the model. - * - * ## Text-only model runs - * Pass `supportsImages: false` for text-only model runs so images are offloaded - * as `media://inbound/` refs instead of being sent as inline image blocks. - * The agent runner can then resolve the refs through the normal media path. - * - * ## Cleanup on failure - * On any parse failure after files have already been offloaded, best-effort - * cleanup is performed before rethrowing so that malformed requests do not - * accumulate orphaned files on disk ahead of the periodic TTL sweep. - * - * ## Known ordering limitation - * In mixed large/small batches, the model receives images in a different order - * than the original attachment list because detectAndLoadPromptImages - * initialises from existingImages first, then appends prompt-detected refs. - * A future refactor should unify all image references into a single ordered list. - * - * @throws {MediaOffloadError} Infrastructure failure saving to media store → 5xx. - * @throws {Error} Input validation failure → 4xx. - */ export async function parseMessageWithAttachments( message: string, attachments: ChatAttachment[] | undefined, - opts?: { maxBytes?: number; log?: AttachmentLog; supportsImages?: boolean }, + opts?: { + maxBytes?: number; + log?: AttachmentLog; + supportsImages?: boolean; + supportsInlineImages?: boolean; + acceptNonImage?: boolean; + }, ): Promise { - const maxBytes = opts?.maxBytes ?? 5_000_000; + const maxBytes = opts?.maxBytes ?? DEFAULT_CHAT_ATTACHMENT_MAX_MB * 1024 * 1024; const log = opts?.log; + const shouldForceImageOffload = opts?.supportsImages === false; + const supportsInlineImages = opts?.supportsInlineImages !== false; + const acceptNonImage = opts?.acceptNonImage !== false; if (!attachments || attachments.length === 0) { return { message, images: [], imageOrder: [], offloadedRefs: [] }; @@ -319,11 +231,7 @@ export async function parseMessageWithAttachments( const imageOrder: PromptImageOrderEntry[] = []; const offloadedRefs: OffloadedRef[] = []; let updatedMessage = message; - const shouldForceOffload = opts?.supportsImages === false; let textOnlyImageOffloadCount = 0; - - // Track IDs of files saved during this request for cleanup if a later - // attachment fails validation and the entire parse is aborted. const savedMediaIds: string[] = []; try { @@ -339,16 +247,14 @@ export async function parseMessageWithAttachments( const { base64: b64, label, mime } = normalized; + if (b64.length === 0) { + throw new UnsupportedAttachmentError("empty-payload", `attachment ${label}: empty payload`); + } if (!isValidBase64(b64)) { throw new Error(`attachment ${label}: invalid base64 content`); } const sizeBytes = estimateBase64DecodedBytes(b64); - if (sizeBytes <= 0) { - log?.warn(`attachment ${label}: estimated size is zero, dropping`); - continue; - } - if (sizeBytes > maxBytes) { throw new Error( `attachment ${label}: exceeds size limit (${sizeBytes} > ${maxBytes} bytes)`, @@ -357,67 +263,74 @@ export async function parseMessageWithAttachments( const providedMime = normalizeMime(mime); const sniffedMime = normalizeMime(await sniffMimeFromBase64(b64)); + const labelMime = normalizeMime(mimeTypeFromFilePath(label)); + const trustedProvidedMime = shouldIgnoreProvidedImageMime({ sniffedMime, providedMime }) + ? undefined + : providedMime; - if (sniffedMime && !isImageMime(sniffedMime) && isImageMime(providedMime)) { - log?.warn(`attachment ${label}: detected non-image (${sniffedMime}), dropping`); - continue; - } + // Prefer specific MIME signals over generic container types. OOXML + // documents (docx/xlsx/pptx) sniff as application/zip; without this + // priority the agent would receive a `.zip` instead of the specific + // Office document the caller declared. + const finalMime = + (sniffedMime && !isGenericContainerMime(sniffedMime) && sniffedMime) || + (trustedProvidedMime && + !isGenericContainerMime(trustedProvidedMime) && + trustedProvidedMime) || + (labelMime && !isGenericContainerMime(labelMime) && labelMime) || + sniffedMime || + trustedProvidedMime || + labelMime || + "application/octet-stream"; - const shouldHandleAsImage = - isImageMime(sniffedMime) || (isImageMime(providedMime) && !sniffedMime); - if (!shouldHandleAsImage) { - const finalMime = sniffedMime ?? providedMime ?? "application/octet-stream"; - if (isVideoMime(finalMime)) { - log?.warn(`attachment ${label}: video attachments are not supported, dropping`); - continue; - } - - const buffer = Buffer.from(b64, "base64"); - verifyDecodedSize(buffer, sizeBytes, label); - - try { - const rawResult = await saveMediaBuffer(buffer, finalMime, "inbound", maxBytes, label); - const savedMedia = assertSavedMedia(rawResult, label); - savedMediaIds.push(savedMedia.id); - - const mediaRef = `media://inbound/${savedMedia.id}`; - updatedMessage += `\n[media attached: ${mediaRef}]`; - log?.info?.(`[Gateway] Saved file attachment. Saved: ${mediaRef}`); - offloadedRefs.push({ - mediaRef, - id: savedMedia.id, - path: savedMedia.path ?? "", - mimeType: finalMime, - label, - }); - imageOrder.push("offloaded"); - } catch (err) { - const errorMessage = formatErrorMessage(err); - throw new MediaOffloadError( - `[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`, - { cause: err }, - ); - } - continue; - } if ( sniffedMime && providedMime && - !isGenericMime(providedMime) && + !isGenericContainerMime(providedMime) && sniffedMime !== providedMime ) { + const usedSource = + finalMime === sniffedMime + ? "sniffed" + : finalMime === providedMime + ? "provided" + : "label-derived"; log?.warn( - `attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using sniffed`, + `attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using ${usedSource}`, ); } - // Third fallback normalises `mime` so a raw un-normalised string (e.g. - // "IMAGE/JPEG") does not silently bypass the SUPPORTED_OFFLOAD_MIMES check. - const finalMime = sniffedMime ?? providedMime ?? normalizeMime(mime) ?? mime; + const isImage = isImageMime(finalMime); + if (isImage && !supportsInlineImages && !shouldForceImageOffload) { + throw new UnsupportedAttachmentError( + "text-only-image", + `attachment ${label}: active model does not accept image inputs`, + ); + } + if (!isImage && !acceptNonImage) { + throw new UnsupportedAttachmentError( + "unsupported-non-image", + `attachment ${label}: non-image attachments (${finalMime}) are not supported on this entrypoint`, + ); + } + // Agent-side hydration (loadImageFromRef via optimizeAndClampImage / GIF + // direct compare) caps at MAX_IMAGE_BYTES. Accepting images above that + // would offload a file the runner later drops to null — a successful + // response with a silently missing image. Reject here so the client + // sees an explicit 4xx. Non-image attachments keep the full maxBytes + // ceiling because their host path (ctx.MediaPaths → Read/Bash) doesn't + // load into the model. + if (isImage && sizeBytes > MAX_IMAGE_BYTES) { + throw new Error( + `attachment ${label}: image exceeds size limit (${sizeBytes} > ${MAX_IMAGE_BYTES} bytes)`, + ); + } - let isOffloaded = false; - - if (shouldForceOffload && textOnlyImageOffloadCount >= TEXT_ONLY_OFFLOAD_LIMIT) { + if ( + shouldForceImageOffload && + isImage && + textOnlyImageOffloadCount >= TEXT_ONLY_OFFLOAD_LIMIT + ) { log?.warn( `attachment ${label}: dropping image because text-only offload limit ` + `${TEXT_ONLY_OFFLOAD_LIMIT} was reached`, @@ -426,93 +339,64 @@ export async function parseMessageWithAttachments( continue; } - if (shouldForceOffload || sizeBytes > OFFLOAD_THRESHOLD_BYTES) { - const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime); + const shouldOffload = + shouldForceImageOffload || !isImage || sizeBytes > OFFLOAD_THRESHOLD_BYTES; - if (!isSupportedForOffload) { - if (shouldForceOffload) { - log?.warn( - `attachment ${label}: format ${finalMime} cannot be offloaded for ` + - "text-only model, dropping", - ); - continue; - } - // Passing this inline would reintroduce the OOM risk this PR prevents. - throw new Error( - `attachment ${label}: format ${finalMime} is too large to pass inline ` + - `(${sizeBytes} > ${OFFLOAD_THRESHOLD_BYTES} bytes) and cannot be offloaded. ` + - `Please convert to JPEG, PNG, WEBP, GIF, HEIC, or HEIF.`, - ); - } - - // Decode and run input-validation BEFORE the MediaOffloadError try/catch. - // verifyDecodedSize is a 4xx client error and must not be wrapped as a - // 5xx MediaOffloadError. - const buffer = Buffer.from(b64, "base64"); - verifyDecodedSize(buffer, sizeBytes, label); - - // Only the storage operation is wrapped so callers can distinguish - // infrastructure failures (5xx) from input errors (4xx). - try { - const labelWithExt = ensureExtension(label, finalMime); - - const rawResult = await saveMediaBuffer( - buffer, - finalMime, - "inbound", - maxBytes, - labelWithExt, - ); - - const savedMedia = assertSavedMedia(rawResult, label); - - // Track for cleanup if a subsequent attachment fails. - savedMediaIds.push(savedMedia.id); - - // Opaque URI — compatible with workspaceOnly sandboxes and decouples - // the Gateway from the agent's filesystem layout. - const mediaRef = `media://inbound/${savedMedia.id}`; - - updatedMessage += `\n[media attached: ${mediaRef}]`; - log?.info?.( - shouldForceOffload - ? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}` - : `[Gateway] Intercepted large image payload. Saved: ${mediaRef}`, - ); - - // Record for transcript metadata — separate from `images` because - // these are not passed inline to the model. - offloadedRefs.push({ - mediaRef, - id: savedMedia.id, - path: savedMedia.path ?? "", - mimeType: finalMime, - label, - }); - imageOrder.push("offloaded"); - if (shouldForceOffload) { - textOnlyImageOffloadCount++; - } - - isOffloaded = true; - } catch (err) { - const errorMessage = formatErrorMessage(err); - throw new MediaOffloadError( - `[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`, - { cause: err }, - ); - } - } - - if (isOffloaded) { + if (!shouldOffload) { + images.push({ type: "image", data: b64, mimeType: finalMime }); + imageOrder.push("inline"); continue; } - images.push({ type: "image", data: b64, mimeType: finalMime }); - imageOrder.push("inline"); + const buffer = Buffer.from(b64, "base64"); + verifyDecodedSize(buffer, sizeBytes, label); + + let savedMedia: SavedMedia; + try { + const labelWithExt = ensureExtension(label, finalMime); + const rawResult = await saveMediaBuffer( + buffer, + finalMime, + "inbound", + maxBytes, + labelWithExt, + ); + savedMedia = assertSavedMedia(rawResult, label); + } catch (err) { + throw new MediaOffloadError( + `[Gateway Error] Failed to save intercepted media to disk: ${formatErrorMessage(err)}`, + { cause: err }, + ); + } + + savedMediaIds.push(savedMedia.id); + + const mediaRef = `media://inbound/${savedMedia.id}`; + if (isImage) { + updatedMessage += `\n[media attached: ${mediaRef}]`; + } + log?.info?.( + shouldForceImageOffload && isImage + ? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}` + : `[Gateway] Offloaded attachment (${finalMime}). Saved: ${mediaRef}`, + ); + + offloadedRefs.push({ + mediaRef, + id: savedMedia.id, + path: savedMedia.path, + mimeType: finalMime, + label, + sizeBytes, + }); + if (isImage) { + imageOrder.push("offloaded"); + if (shouldForceImageOffload) { + textOnlyImageOffloadCount++; + } + } } } catch (err) { - // Best-effort cleanup before rethrowing. if (savedMediaIds.length > 0) { await Promise.allSettled(savedMediaIds.map((id) => deleteMediaBuffer(id, "inbound"))); } diff --git a/src/gateway/server-methods/agent.ts b/src/gateway/server-methods/agent.ts index aa061ce0c02..640df493eb4 100644 --- a/src/gateway/server-methods/agent.ts +++ b/src/gateway/server-methods/agent.ts @@ -79,7 +79,11 @@ import { } from "../../utils/message-channel.js"; import { resolveAssistantIdentity } from "../assistant-identity.js"; import { registerChatAbortController, resolveAgentRunExpiresAtMs } from "../chat-abort.js"; -import { MediaOffloadError, parseMessageWithAttachments } from "../chat-attachments.js"; +import { + MediaOffloadError, + parseMessageWithAttachments, + resolveChatAttachmentMaxBytes, +} from "../chat-attachments.js"; import { resolveAssistantAvatarUrl } from "../control-ui-shared.js"; import { ADMIN_SCOPE } from "../method-scopes.js"; import { GATEWAY_CLIENT_CAPS, hasGatewayClientCap } from "../protocol/client-info.js"; @@ -498,7 +502,7 @@ export const agentHandlers: GatewayRequestHandlers = { } const effectiveProvider = providerOverride || baseProvider; const effectiveModel = modelOverride || baseModel; - const supportsImages = await resolveGatewayModelSupportsImages({ + const supportsInlineImages = await resolveGatewayModelSupportsImages({ loadGatewayModelCatalog: context.loadGatewayModelCatalog, provider: effectiveProvider, model: effectiveModel, @@ -506,9 +510,13 @@ export const agentHandlers: GatewayRequestHandlers = { try { const parsed = await parseMessageWithAttachments(message, normalizedAttachments, { - maxBytes: 5_000_000, + maxBytes: resolveChatAttachmentMaxBytes(cfg), log: context.logGateway, - supportsImages, + supportsInlineImages, + // agent.run does not yet wire a ctx.MediaPaths stage path, so reject + // non-image attachments explicitly (UnsupportedAttachmentError) + // instead of saving them where the agent cannot reach them. + acceptNonImage: false, }); message = parsed.message.trim(); images = parsed.images; diff --git a/src/gateway/server-methods/chat.directive-tags.test.ts b/src/gateway/server-methods/chat.directive-tags.test.ts index 41154f2063c..f9a16040671 100644 --- a/src/gateway/server-methods/chat.directive-tags.test.ts +++ b/src/gateway/server-methods/chat.directive-tags.test.ts @@ -66,6 +66,15 @@ const mockState = vi.hoisted(() => ({ saveMediaWait: null as Promise | null, activeSaveMediaCalls: 0, maxActiveSaveMediaCalls: 0, + sandboxWorkspace: null as { workspaceDir: string; containerWorkdir?: string } | null, + stageSandboxMediaError: null as Error | null, + stagedRelativePaths: null as string[] | null, + // `unstagedSources` lets tests simulate partial staging failure: absolute + // source paths listed here are excluded from the returned `staged` map even + // though ctx still carries their rewritten paths. This mirrors how the real + // stageSandboxMedia silently skips over-cap files. + unstagedSources: null as string[] | null, + deleteMediaBufferCalls: [] as Array<{ id: string; subdir?: string }>, })); const bindingMocks = vi.hoisted(() => ({ @@ -216,11 +225,54 @@ vi.mock("../../sessions/transcript-events.js", () => ({ ), })); +vi.mock("../../agents/sandbox/context.js", async () => { + const original = await vi.importActual( + "../../agents/sandbox/context.js", + ); + return { + ...original, + ensureSandboxWorkspaceForSession: vi.fn(async () => mockState.sandboxWorkspace), + }; +}); + +vi.mock("../../auto-reply/reply/stage-sandbox-media.js", () => ({ + stageSandboxMedia: vi.fn( + async (params: { ctx: { MediaPaths?: string[]; MediaPath?: string } }) => { + if (mockState.stageSandboxMediaError) { + throw mockState.stageSandboxMediaError; + } + const staged = new Map(); + const originalPaths = params.ctx.MediaPaths ?? []; + if (mockState.stagedRelativePaths) { + const mapping = mockState.stagedRelativePaths; + params.ctx.MediaPaths = [...mapping]; + params.ctx.MediaPath = mapping[0]; + for (let i = 0; i < mapping.length; i += 1) { + const source = originalPaths[i]; + const dest = mapping[i]; + if (source && dest) { + staged.set(source, dest); + } + } + } + if (mockState.unstagedSources) { + for (const source of mockState.unstagedSources) { + staged.delete(source); + } + } + return { staged }; + }, + ), +})); + vi.mock("../../media/store.js", async () => { const original = await vi.importActual("../../media/store.js"); return { ...original, + deleteMediaBuffer: vi.fn(async (id: string, subdir?: string) => { + mockState.deleteMediaBufferCalls.push({ id, subdir }); + }), saveMediaBuffer: vi.fn(async (buffer: Buffer, contentType?: string, subdir?: string) => { mockState.activeSaveMediaCalls += 1; mockState.maxActiveSaveMediaCalls = Math.max( @@ -465,6 +517,11 @@ describe("chat directive tag stripping for non-streaming final payloads", () => mockState.maxActiveSaveMediaCalls = 0; bindingMocks.resolveByConversation.mockReset(); bindingMocks.resolveByConversation.mockReturnValue(null); + mockState.sandboxWorkspace = null; + mockState.stageSandboxMediaError = null; + mockState.stagedRelativePaths = null; + mockState.unstagedSources = null; + mockState.deleteMediaBufferCalls = []; }); it("registers tool-event recipients for clients advertising tool-events capability", async () => { @@ -1965,10 +2022,8 @@ describe("chat directive tag stripping for non-streaming final payloads", () => } | undefined; expect(mockState.lastDispatchImages).toBeUndefined(); - expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]); - expect(mockState.lastDispatchCtx?.Body).toMatch( - /^summarize this\n\[media attached: media:\/\/inbound\//, - ); + expect(mockState.lastDispatchImageOrder).toBeUndefined(); + expect(mockState.lastDispatchCtx?.Body).toBe("summarize this"); expect(mockState.savedMediaCalls).toEqual([ expect.objectContaining({ contentType: "application/pdf", @@ -1976,7 +2031,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => size: expect.any(Number), }), ]); - expect(message?.content).toMatch(/^summarize this\n\[media attached: media:\/\/inbound\//); + expect(message?.content).toBe("summarize this"); expect(message?.MediaPath).toBe("/tmp/chat-send-brief.pdf"); expect(message?.MediaPaths).toEqual(["/tmp/chat-send-brief.pdf"]); expect(message?.MediaType).toBe("application/pdf"); @@ -2335,6 +2390,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => ], }, expectBroadcast: false, + waitFor: "none", }); expect(mockState.lastDispatchImages).toBeUndefined(); @@ -2449,6 +2505,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => ], }, expectBroadcast: false, + waitFor: "none", }); expect(mockState.lastDispatchImages).toBeUndefined(); @@ -2461,6 +2518,291 @@ describe("chat directive tag stripping for non-streaming final payloads", () => ]); }); + it("routes non-image offloaded refs into ctx.MediaPaths + MediaTypes for chat.send", async () => { + createTranscriptFixture("openclaw-chat-send-non-image-ctx-media-paths-"); + mockState.finalText = "ok"; + mockState.sessionEntry = { + modelProvider: "test-provider", + model: "vision-model", + }; + mockState.modelCatalog = [ + { + provider: "test-provider", + id: "vision-model", + name: "Vision model", + input: ["text", "image"], + }, + ]; + mockState.savedMediaResults = [ + { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" }, + ]; + const respond = vi.fn(); + const context = createChatContext(); + const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64"); + + await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-non-image-ctx-media", + message: "read this", + requestParams: { + attachments: [ + { + type: "file", + mimeType: "application/pdf", + fileName: "report.pdf", + content: pdf, + }, + ], + }, + expectBroadcast: false, + }); + + expect(mockState.lastDispatchCtx?.MediaPaths).toEqual([ + "/home/user/.openclaw/media/inbound/report.pdf", + ]); + expect(mockState.lastDispatchCtx?.MediaPath).toBe( + "/home/user/.openclaw/media/inbound/report.pdf", + ); + expect(mockState.lastDispatchCtx?.MediaTypes).toEqual(["application/pdf"]); + expect(mockState.lastDispatchCtx?.MediaType).toBe("application/pdf"); + // Non-image offloads MUST NOT inject a media://URI into the prompt body — + // they ride through ctx.MediaPaths so buildInboundMediaNote prepends the + // real path, avoiding duplicate media markers. + expect(mockState.lastDispatchCtx?.Body).not.toContain("media://"); + expect(mockState.lastDispatchCtx?.BodyForAgent).not.toContain("media://"); + expect(mockState.lastDispatchImages).toBeUndefined(); + // Marker replaces the implicit "relative-path no-op" coupling in + // get-reply.ts with an explicit skip contract. + expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true); + }); + + it("preserves sandbox-relative MediaPaths and stores workspace context for media-understanding", async () => { + createTranscriptFixture("openclaw-chat-send-non-image-absolutize-"); + mockState.finalText = "ok"; + mockState.sessionEntry = { + modelProvider: "test-provider", + model: "vision-model", + }; + mockState.modelCatalog = [ + { + provider: "test-provider", + id: "vision-model", + name: "Vision model", + input: ["text", "image"], + }, + ]; + mockState.savedMediaResults = [ + { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" }, + ]; + mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" }; + mockState.stagedRelativePaths = ["media/inbound/report.pdf"]; + const respond = vi.fn(); + const context = createChatContext(); + const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64"); + + await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-non-image-absolutize", + message: "read this", + requestParams: { + attachments: [ + { + type: "file", + mimeType: "application/pdf", + fileName: "report.pdf", + content: pdf, + }, + ], + }, + expectBroadcast: false, + }); + + expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["media/inbound/report.pdf"]); + expect(mockState.lastDispatchCtx?.MediaPath).toBe("media/inbound/report.pdf"); + expect(mockState.lastDispatchCtx?.MediaWorkspaceDir).toBe("/sandbox/workspace"); + expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true); + }); + + it("wraps stageSandboxMedia infrastructure errors as 5xx UNAVAILABLE and cleans up media-store files", async () => { + createTranscriptFixture("openclaw-chat-send-stage-unavailable-"); + mockState.finalText = "ok"; + mockState.sessionEntry = { + modelProvider: "test-provider", + model: "vision-model", + }; + mockState.modelCatalog = [ + { + provider: "test-provider", + id: "vision-model", + name: "Vision model", + input: ["text", "image"], + }, + ]; + mockState.savedMediaResults = [ + { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" }, + ]; + mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" }; + mockState.stageSandboxMediaError = Object.assign(new Error("ENOSPC: no space left on device"), { + code: "ENOSPC", + }); + const respond = vi.fn(); + const context = createChatContext(); + const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64"); + + await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-stage-unavailable", + message: "read this", + requestParams: { + attachments: [ + { + type: "file", + mimeType: "application/pdf", + fileName: "report.pdf", + content: pdf, + }, + ], + }, + expectBroadcast: false, + waitFor: "none", + }); + + // Plain Error from stageSandboxMedia would be misclassified as INVALID_REQUEST + // by the outer catch. Wrapping it in MediaOffloadError routes it to UNAVAILABLE + // so the client retries instead of treating it as a bad request. + expect(mockState.lastDispatchCtx).toBeUndefined(); + expect(respond).toHaveBeenCalledTimes(1); + const [ok, payload, error] = respond.mock.calls[0] ?? []; + expect(ok).toBe(false); + expect(payload).toBeUndefined(); + expect(error?.code).toBe(ErrorCodes.UNAVAILABLE); + expect(error?.message ?? String(error)).toMatch(/ENOSPC|non-image attachments/i); + // Orphaned media-store files are cleaned up before the 5xx surfaces. + expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]); + }); + + it("surfaces partial non-image staging failures as 5xx UNAVAILABLE", async () => { + // Regression: stageSandboxMedia keeps unstaged entries as their original + // absolute path, so a simple `stagedPaths.length === nonImage.length` + // check could not detect when one of the files silently fell out (e.g. a + // file between the RPC cap and the staging cap). Prestage must compare + // the returned `staged` map against the input refs. + createTranscriptFixture("openclaw-chat-send-partial-stage-"); + mockState.finalText = "ok"; + mockState.sessionEntry = { + modelProvider: "test-provider", + model: "vision-model", + }; + mockState.modelCatalog = [ + { + provider: "test-provider", + id: "vision-model", + name: "Vision model", + input: ["text", "image"], + }, + ]; + mockState.savedMediaResults = [ + { path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" }, + { path: "/home/user/.openclaw/media/inbound/oversize.pdf", contentType: "application/pdf" }, + ]; + mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" }; + mockState.stagedRelativePaths = ["media/inbound/report.pdf", "media/inbound/oversize.pdf"]; + mockState.unstagedSources = ["/home/user/.openclaw/media/inbound/oversize.pdf"]; + const respond = vi.fn(); + const context = createChatContext(); + const pdf = Buffer.from("%PDF-1.4\n").toString("base64"); + + await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-partial-stage", + message: "read these", + requestParams: { + attachments: [ + { type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf }, + { type: "file", mimeType: "application/pdf", fileName: "oversize.pdf", content: pdf }, + ], + }, + expectBroadcast: false, + waitFor: "none", + }); + + expect(mockState.lastDispatchCtx).toBeUndefined(); + expect(respond).toHaveBeenCalledTimes(1); + const [ok, payload, error] = respond.mock.calls[0] ?? []; + expect(ok).toBe(false); + expect(payload).toBeUndefined(); + expect(error?.code).toBe(ErrorCodes.UNAVAILABLE); + expect(error?.message ?? String(error)).toMatch(/staging incomplete/i); + // Both media-store entries are cleaned up before the 5xx surfaces. + expect(mockState.deleteMediaBufferCalls.map((c) => c.id).toSorted()).toEqual([ + "saved-media", + "saved-media", + ]); + }); + + it("rejects sandbox-oversized non-image attachments as 4xx before staging", async () => { + // Regression: resolveChatAttachmentMaxBytes defaults to 20MB, but + // stageSandboxMedia caps each file at STAGED_MEDIA_MAX_BYTES (5MB) and + // silently drops oversize files. Without a pre-check, a sandbox session + // accepting a 5-20MB non-image would fail staging and surface as a + // retryable 5xx UNAVAILABLE, misleading clients into retrying a + // deterministically broken request. + createTranscriptFixture("openclaw-chat-send-sandbox-oversize-"); + mockState.finalText = "ok"; + mockState.sessionEntry = { + modelProvider: "test-provider", + model: "vision-model", + }; + mockState.modelCatalog = [ + { + provider: "test-provider", + id: "vision-model", + name: "Vision model", + input: ["text", "image"], + }, + ]; + mockState.savedMediaResults = [ + { path: "/home/user/.openclaw/media/inbound/huge.pdf", contentType: "application/pdf" }, + ]; + mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" }; + const respond = vi.fn(); + const context = createChatContext(); + // 6MB buffer — above STAGED_MEDIA_MAX_BYTES (5MB) but below the 20MB parse cap. + const oversized = Buffer.alloc(6 * 1024 * 1024); + oversized.set(Buffer.from("%PDF-1.4\n"), 0); + const pdf = oversized.toString("base64"); + + await runNonStreamingChatSend({ + context, + respond, + idempotencyKey: "idem-sandbox-oversize", + message: "read this", + requestParams: { + attachments: [ + { type: "file", mimeType: "application/pdf", fileName: "huge.pdf", content: pdf }, + ], + }, + expectBroadcast: false, + waitFor: "none", + }); + + expect(mockState.lastDispatchCtx).toBeUndefined(); + expect(respond).toHaveBeenCalledTimes(1); + const [ok, payload, error] = respond.mock.calls[0] ?? []; + expect(ok).toBe(false); + expect(payload).toBeUndefined(); + // 4xx, not 5xx — retrying a file that exceeds the staging cap cannot + // succeed, so the failure must be surfaced as a client-side rejection. + expect(error?.code).toBe(ErrorCodes.INVALID_REQUEST); + expect(error?.message ?? String(error)).toMatch(/sandbox staging limit/i); + // Orphaned media-store entries are cleaned up before the 4xx surfaces. + expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]); + }); + it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => { createTranscriptFixture("openclaw-chat-send-image-order-"); mockState.finalText = "ok"; diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts index df2f3698e85..40e299aa5af 100644 --- a/src/gateway/server-methods/chat.ts +++ b/src/gateway/server-methods/chat.ts @@ -2,16 +2,20 @@ import fs from "node:fs"; import path from "node:path"; import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent"; import { resolveSendableOutboundReplyParts } from "openclaw/plugin-sdk/reply-payload"; -import { resolveSessionAgentId } from "../../agents/agent-scope.js"; +import { resolveAgentWorkspaceDir, resolveSessionAgentId } from "../../agents/agent-scope.js"; import { resolveThinkingDefault } from "../../agents/model-selection.js"; import { rewriteTranscriptEntriesInSessionFile } from "../../agents/pi-embedded-runner/transcript-rewrite.js"; +import { ensureSandboxWorkspaceForSession } from "../../agents/sandbox/context.js"; import { resolveAgentTimeoutMs } from "../../agents/timeout.js"; import { dispatchInboundMessage } from "../../auto-reply/dispatch.js"; import type { ReplyPayload } from "../../auto-reply/reply-payload.js"; import { createReplyDispatcher } from "../../auto-reply/reply/reply-dispatcher.js"; -import type { MsgContext } from "../../auto-reply/templating.js"; +import { stageSandboxMedia } from "../../auto-reply/reply/stage-sandbox-media.js"; +import type { MsgContext, TemplateContext } from "../../auto-reply/templating.js"; import { extractCanvasFromText } from "../../chat/canvas-render.js"; import { resolveSessionFilePath } from "../../config/sessions.js"; +import type { OpenClawConfig } from "../../config/types.openclaw.js"; +import { formatErrorMessage } from "../../infra/errors.js"; import { jsonUtf8Bytes } from "../../infra/json-utf8-bytes.js"; import { normalizeReplyPayloadsForDelivery } from "../../infra/outbound/payloads.js"; import { getSessionBindingService } from "../../infra/outbound/session-binding-service.js"; @@ -22,7 +26,12 @@ import { } from "../../media/local-roots.js"; import { isAudioFileName } from "../../media/mime.js"; import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js"; -import { type SavedMedia, saveMediaBuffer } from "../../media/store.js"; +import { + deleteMediaBuffer, + MEDIA_MAX_BYTES, + type SavedMedia, + saveMediaBuffer, +} from "../../media/store.js"; import { createChannelReplyPipeline } from "../../plugin-sdk/channel-reply-pipeline.js"; import { isPluginOwnedSessionBindingRecord } from "../../plugins/conversation-binding.js"; import { normalizeInputProvenance, type InputProvenance } from "../../sessions/input-provenance.js"; @@ -48,10 +57,12 @@ import { } from "../chat-abort.js"; import { type ChatImageContent, + MediaOffloadError, type OffloadedRef, parseMessageWithAttachments, + resolveChatAttachmentMaxBytes, + UnsupportedAttachmentError, } from "../chat-attachments.js"; -import { MediaOffloadError } from "../chat-attachments.js"; import { isToolHistoryBlockType, projectChatDisplayMessage, @@ -712,14 +723,28 @@ async function persistChatSendImages(params: { ); } } - const offloadedSaved = params.offloadedRefs.map((ref) => ({ - id: ref.id, - path: ref.path, - size: 0, - contentType: ref.mimeType, - })); + // imageOrder now only tracks image slots (see chat-attachments.ts), so split + // offloaded refs by mime: image offloads interleave with inline images via + // imageOrder, and non-image offloads append to the transcript tail. Without + // this split a non-image file would consume the next image slot whenever + // both kinds appear in the same request. + const imageOffloadedSaved: SavedMedia[] = []; + const nonImageOffloadedSaved: SavedMedia[] = []; + for (const ref of params.offloadedRefs) { + const entry: SavedMedia = { + id: ref.id, + path: ref.path, + size: 0, + contentType: ref.mimeType, + }; + if (ref.mimeType.startsWith("image/")) { + imageOffloadedSaved.push(entry); + } else { + nonImageOffloadedSaved.push(entry); + } + } if (params.imageOrder.length === 0) { - return [...inlineSaved, ...offloadedSaved]; + return [...inlineSaved, ...imageOffloadedSaved, ...nonImageOffloadedSaved]; } const saved: SavedMedia[] = []; let inlineIndex = 0; @@ -732,7 +757,7 @@ async function persistChatSendImages(params: { } continue; } - const offloaded = offloadedSaved[offloadedIndex++]; + const offloaded = imageOffloadedSaved[offloadedIndex++]; if (offloaded) { saved.push(offloaded); } @@ -743,12 +768,15 @@ async function persistChatSendImages(params: { saved.push(inline); } } - for (; offloadedIndex < offloadedSaved.length; offloadedIndex++) { - const offloaded = offloadedSaved[offloadedIndex]; + for (; offloadedIndex < imageOffloadedSaved.length; offloadedIndex++) { + const offloaded = imageOffloadedSaved[offloadedIndex]; if (offloaded) { saved.push(offloaded); } } + for (const offloaded of nonImageOffloadedSaved) { + saved.push(offloaded); + } return saved; } @@ -766,6 +794,115 @@ function buildChatSendTranscriptMessage(params: { }; } +// Stages non-image offloads into the agent sandbox synchronously so chat.send +// can surface 5xx before respond(). Throws MediaOffloadError on any staging +// failure (ENOSPC / EPERM / partial-stage) so the outer chat.send handler can +// map it to UNAVAILABLE (5xx); plain Error would be misclassified as 4xx. All +// offloaded refs are cleaned up from the media store before rethrow. +// Callers MUST set ctx.MediaStaged=true when this runs so the dispatch +// pipeline skips its own stageSandboxMedia pass. +// +// Returned paths are ABSOLUTE (pointing into the sandbox workspace when sandbox +// is enabled, or the media-store origin when it is not). applyMediaUnderstanding +// runs before any further staging in get-reply.ts and uses +// `path.isAbsolute(raw) ? raw : path.resolve(raw)` against the gateway CWD, so +// any relative path here would make media-understanding target the wrong host +// path and silently skip file analysis. +async function prestageNonImageOffloads(params: { + offloadedRefs: OffloadedRef[]; + cfg: OpenClawConfig; + sessionKey: string; + agentId: string; +}): Promise<{ paths: string[]; types: string[]; workspaceDir?: string }> { + const nonImage = params.offloadedRefs.filter((ref) => !ref.mimeType.startsWith("image/")); + if (nonImage.length === 0) { + return { paths: [], types: [] }; + } + + try { + const workspaceDir = resolveAgentWorkspaceDir(params.cfg, params.agentId); + const sandbox = await ensureSandboxWorkspaceForSession({ + config: params.cfg, + sessionKey: params.sessionKey, + workspaceDir, + }); + if (!sandbox) { + return { + paths: nonImage.map((ref) => ref.path), + types: nonImage.map((ref) => ref.mimeType), + }; + } + + // stageSandboxMedia caps each file at STAGED_MEDIA_MAX_BYTES (= + // MEDIA_MAX_BYTES, 5MB) and silently skips oversized files. The parse cap + // (resolveChatAttachmentMaxBytes, default 20MB) is higher, so a sandboxed + // session receiving a non-image file between the two caps would otherwise + // pass parse, fail staging, and surface as a retryable 5xx even though + // retry cannot succeed. Reject here as a client-side 4xx instead. + const oversizedForSandbox = nonImage.filter((ref) => ref.sizeBytes > MEDIA_MAX_BYTES); + if (oversizedForSandbox.length > 0) { + const details = oversizedForSandbox + .map((ref) => `${ref.label} (${ref.sizeBytes} bytes)`) + .join(", "); + throw new UnsupportedAttachmentError( + "non-image-too-large-for-sandbox", + `non-image attachments exceed sandbox staging limit (${MEDIA_MAX_BYTES} bytes): ${details}`, + ); + } + + const stagingCtx: MsgContext = { + MediaPath: nonImage[0].path, + MediaPaths: nonImage.map((ref) => ref.path), + MediaType: nonImage[0].mimeType, + MediaTypes: nonImage.map((ref) => ref.mimeType), + }; + const stageResult = await stageSandboxMedia({ + ctx: stagingCtx, + sessionCtx: stagingCtx as TemplateContext, + cfg: params.cfg, + sessionKey: params.sessionKey, + workspaceDir, + }); + + // stageSandboxMedia silently keeps unstaged entries as their original + // absolute path, so length parity with `nonImage` does not prove every + // file landed in the sandbox. The RPC max (20MB via + // resolveChatAttachmentMaxBytes) admits files above the staging cap + // (STAGED_MEDIA_MAX_BYTES = 5MB); check the returned `staged` map so any + // missing source becomes a 5xx MediaOffloadError the client can retry. + const stagedSources = stageResult.staged; + const missing = nonImage.filter((ref) => !stagedSources.has(ref.path)); + if (missing.length > 0) { + throw new Error( + `non-image attachment staging incomplete: ${stagedSources.size}/${nonImage.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`, + ); + } + const stagedPaths = stagingCtx.MediaPaths ?? []; + const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType); + + // Keep stagedPaths sandbox-relative (e.g. `media/inbound/foo.pdf`) so the + // agent inside the container can read them. Host-side media-understanding + // resolves them via ctx.MediaWorkspaceDir, which we carry separately. + return { paths: stagedPaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir }; + } catch (err) { + await Promise.allSettled( + params.offloadedRefs.map((ref) => deleteMediaBuffer(ref.id, "inbound")), + ); + if (err instanceof MediaOffloadError) { + throw err; + } + // Sandbox-oversize rejections are client-side 4xx (see check above). Wrapping + // them as MediaOffloadError would misclassify them as retryable 5xx. + if (err instanceof UnsupportedAttachmentError) { + throw err; + } + throw new MediaOffloadError( + `[Gateway Error] Failed to stage non-image attachments into agent workspace: ${formatErrorMessage(err)}`, + { cause: err }, + ); + } +} + function resolveChatSendTranscriptMediaFields(savedImages: SavedMedia[]) { const mediaPaths = savedImages.map((entry) => entry.path); if (mediaPaths.length === 0) { @@ -1759,6 +1896,9 @@ export const chatHandlers: GatewayRequestHandlers = { let parsedImages: ChatImageContent[] = []; let imageOrder: PromptImageOrderEntry[] = []; let offloadedRefs: OffloadedRef[] = []; + let nonImageMediaPaths: string[] = []; + let nonImageMediaTypes: string[] = []; + let nonImageMediaWorkspaceDir: string | undefined; const timeoutMs = resolveAgentTimeoutMs({ cfg, overrideMs: p.timeoutMs, @@ -1833,14 +1973,27 @@ export const chatHandlers: GatewayRequestHandlers = { explicitOriginTargetsPlugin; try { const parsed = await parseMessageWithAttachments(inboundMessage, normalizedAttachments, { - maxBytes: 5_000_000, + maxBytes: resolveChatAttachmentMaxBytes(cfg), log: context.logGateway, supportsImages, + // chat.send routes non-image offloadedRefs into ctx.MediaPaths below + // so the auto-reply stage pipeline can surface them to the agent. + acceptNonImage: true, }); parsedMessage = parsed.message; parsedImages = parsed.images; imageOrder = parsed.imageOrder; offloadedRefs = parsed.offloadedRefs; + ({ + paths: nonImageMediaPaths, + types: nonImageMediaTypes, + workspaceDir: nonImageMediaWorkspaceDir, + } = await prestageNonImageOffloads({ + offloadedRefs, + cfg, + sessionKey, + agentId, + })); } catch (err) { respond( false, @@ -1947,6 +2100,19 @@ export const chatHandlers: GatewayRequestHandlers = { GatewayClientScopes: client?.connect?.scopes ?? [], ...pluginBoundMediaFields, }; + if (nonImageMediaPaths.length > 0) { + // Inject non-image offloads via the same MsgContext fields the channel + // path uses so buildInboundMediaNote renders a real `[media attached: + // ]` line into the agent prompt. Marker + // blocks the dispatch pipeline from re-running stageSandboxMedia; see + // prestageNonImageOffloads. + ctx.MediaPath = nonImageMediaPaths[0]; + ctx.MediaPaths = nonImageMediaPaths; + ctx.MediaType = nonImageMediaTypes[0]; + ctx.MediaTypes = nonImageMediaTypes; + ctx.MediaWorkspaceDir = nonImageMediaWorkspaceDir; + ctx.MediaStaged = true; + } const { onModelSelected, ...replyPipeline } = createChannelReplyPipeline({ cfg, diff --git a/src/gateway/server-node-events.runtime.ts b/src/gateway/server-node-events.runtime.ts index 01697810d6a..fa501faccb0 100644 --- a/src/gateway/server-node-events.runtime.ts +++ b/src/gateway/server-node-events.runtime.ts @@ -15,7 +15,7 @@ export { enqueueSystemEvent } from "../infra/system-events.js"; export { deleteMediaBuffer } from "../media/store.js"; export { normalizeMainKey, scopedHeartbeatWakeOptions } from "../routing/session-key.js"; export { defaultRuntime } from "../runtime.js"; -export { parseMessageWithAttachments } from "./chat-attachments.js"; +export { parseMessageWithAttachments, resolveChatAttachmentMaxBytes } from "./chat-attachments.js"; export { normalizeRpcAttachmentsToChatAttachments } from "./server-methods/attachment-normalize.js"; export { loadSessionEntry, diff --git a/src/gateway/server-node-events.test.ts b/src/gateway/server-node-events.test.ts index d4095b62a14..b084e7e05eb 100644 --- a/src/gateway/server-node-events.test.ts +++ b/src/gateway/server-node-events.test.ts @@ -90,6 +90,7 @@ const runtimeMocks = vi.hoisted(() => ({ parseMessageWithAttachments: parseMessageWithAttachmentsMock, registerApnsRegistration: registerApnsRegistrationMock, requestHeartbeatNow: vi.fn(), + resolveChatAttachmentMaxBytes: vi.fn(() => 20 * 1024 * 1024), resolveGatewayModelSupportsImages: vi.fn( async ({ loadGatewayModelCatalog, @@ -923,7 +924,7 @@ describe("agent request events", () => { expect(opts.runId).toBe(opts.sessionId); }); - it("passes supportsImages false for text-only node-session models", async () => { + it("passes supportsInlineImages false for text-only node-session models", async () => { const ctx = buildCtx(); ctx.loadGatewayModelCatalog = async () => [ { @@ -960,7 +961,41 @@ describe("agent request events", () => { expect(parseMessageWithAttachmentsMock).toHaveBeenCalledWith( "describe", expect.any(Array), - expect.objectContaining({ supportsImages: false }), + expect.objectContaining({ supportsInlineImages: false }), ); }); + + it("declines non-image attachments cleanly when parse throws UnsupportedAttachmentError", async () => { + const warn = vi.fn(); + const ctx = buildCtx(); + ctx.logGateway = { warn }; + + parseMessageWithAttachmentsMock.mockRejectedValueOnce( + Object.assign(new Error("attachment a.pdf: non-image attachments not supported"), { + name: "UnsupportedAttachmentError", + reason: "unsupported-non-image", + }), + ); + + await handleNodeEvent(ctx, "node-non-image-refusal", { + event: "agent.request", + payloadJSON: JSON.stringify({ + message: "read this", + sessionKey: "agent:main:main", + attachments: [ + { + type: "file", + mimeType: "application/pdf", + fileName: "a.pdf", + content: "JVBERi0=", + }, + ], + }), + }); + + // server-node-events must log-and-return on parse failure — no agent + // dispatch, no crash, and the refusal reason bubbles up via logGateway. + expect(agentCommandMock).not.toHaveBeenCalled(); + expect(warn).toHaveBeenCalledWith(expect.stringMatching(/attachment parse failed.*non-image/i)); + }); }); diff --git a/src/gateway/server-node-events.ts b/src/gateway/server-node-events.ts index e763af3a746..31bcfc1595f 100644 --- a/src/gateway/server-node-events.ts +++ b/src/gateway/server-node-events.ts @@ -26,6 +26,7 @@ import { parseMessageWithAttachments, registerApnsRegistration, requestHeartbeatNow, + resolveChatAttachmentMaxBytes, resolveGatewayModelSupportsImages, resolveOutboundTarget, resolveSessionAgentId, @@ -427,16 +428,20 @@ export const handleNodeEvent = async (ctx: NodeEventContext, nodeId: string, evt if (normalizedAttachments.length > 0) { const sessionAgentId = resolveSessionAgentId({ sessionKey, config: cfg }); const modelRef = resolveSessionModelRef(cfg, entry, sessionAgentId); - const supportsImages = await resolveGatewayModelSupportsImages({ + const supportsInlineImages = await resolveGatewayModelSupportsImages({ loadGatewayModelCatalog: ctx.loadGatewayModelCatalog, provider: modelRef.provider, model: modelRef.model, }); try { const parsed = await parseMessageWithAttachments(message, normalizedAttachments, { - maxBytes: 5_000_000, + maxBytes: resolveChatAttachmentMaxBytes(cfg), log: ctx.logGateway, - supportsImages, + supportsInlineImages, + // server-node-events dispatches via agentCommandFromIngress which + // has no ctx.MediaPaths wiring; reject non-image attachments + // explicitly rather than saving them where the agent cannot reach them. + acceptNonImage: false, }); message = parsed.message.trim(); images = parsed.images; diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 6a69fef139c..06acf6397e7 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -536,6 +536,7 @@ export async function applyMediaUnderstanding(params: { const cache = createMediaAttachmentCache(attachments, { localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }), ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy, + workspaceDir: ctx.MediaWorkspaceDir, }); try { diff --git a/src/media-understanding/attachments.cache.ts b/src/media-understanding/attachments.cache.ts index d3f18ce6e7d..194093d61d9 100644 --- a/src/media-understanding/attachments.cache.ts +++ b/src/media-understanding/attachments.cache.ts @@ -53,6 +53,7 @@ export type MediaAttachmentCacheOptions = { localPathRoots?: readonly string[]; includeDefaultLocalPathRoots?: boolean; ssrfPolicy?: SsrFPolicy; + workspaceDir?: string; }; function resolveRequestUrl(input: RequestInfo | URL): string { @@ -70,6 +71,7 @@ export class MediaAttachmentCache { private readonly attachments: MediaAttachment[]; private readonly localPathRoots: readonly string[]; private readonly ssrfPolicy: SsrFPolicy | undefined; + private readonly workspaceDir?: string; private canonicalLocalPathRoots?: Promise; constructor(attachments: MediaAttachment[], options?: MediaAttachmentCacheOptions) { @@ -79,6 +81,7 @@ export class MediaAttachmentCache { options?.includeDefaultLocalPathRoots === false ? mergeInboundPathRoots(options.localPathRoots) : mergeInboundPathRoots(options?.localPathRoots, getDefaultLocalPathRoots()); + this.workspaceDir = options?.workspaceDir ? path.resolve(options.workspaceDir) : undefined; for (const attachment of attachments) { this.entries.set(attachment.index, { attachment }); } @@ -293,7 +296,7 @@ export class MediaAttachmentCache { if (!rawPath) { return undefined; } - return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath); + return this.workspaceDir ? path.resolve(this.workspaceDir, rawPath) : path.resolve(rawPath); } private async ensureLocalStat(entry: AttachmentCacheEntry): Promise { diff --git a/src/media-understanding/media-understanding-misc.test.ts b/src/media-understanding/media-understanding-misc.test.ts index dd845db6e06..c5e953f1098 100644 --- a/src/media-understanding/media-understanding-misc.test.ts +++ b/src/media-understanding/media-understanding-misc.test.ts @@ -97,6 +97,23 @@ describe("media understanding attachments SSRF", () => { }); }); + it("resolves relative attachment paths against the provided workspaceDir", async () => { + await withTempDir({ prefix: "openclaw-media-cache-workspace-" }, async (base) => { + const workspaceDir = path.join(base, "workspace"); + const attachmentPath = path.join(workspaceDir, "media", "inbound", "report.pdf"); + await fs.mkdir(path.dirname(attachmentPath), { recursive: true }); + await fs.writeFile(attachmentPath, "ok"); + + const cache = new MediaAttachmentCache([{ index: 0, path: "media/inbound/report.pdf" }], { + localPathRoots: [workspaceDir], + workspaceDir, + }); + + const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 }); + expect(result.buffer.toString()).toBe("ok"); + }); + }); + it("blocks local attachments outside configured roots", async () => { if (process.platform === "win32") { return; diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 36f1ae04868..001b810e59b 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -270,8 +270,15 @@ export function resolveMediaAttachmentLocalRoots(params: { cfg: OpenClawConfig; ctx: MsgContext; }): readonly string[] { + // ctx.MediaWorkspaceDir is set by chat.send's prestageNonImageOffloads when + // inbound attachments were staged into a sandbox workspace. The paths in + // ctx.MediaPaths are kept sandbox-relative (so the agent inside the + // container can read them), and the workspace dir is carried separately so + // host-side media-understanding can still resolve them via this root list. + const workspaceDir = params.ctx.MediaWorkspaceDir; return mergeInboundPathRoots( getDefaultMediaLocalRoots(), + workspaceDir ? [path.resolve(workspaceDir)] : undefined, resolveChannelInboundAttachmentRoots(params), ); }