mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:50:43 +00:00
[Feat] Gateway: offload non-image attachments on chat.send (#67572)
Merged via squash.
Prepared head SHA: ecbd27fc30
Co-authored-by: samzong <13782141+samzong@users.noreply.github.com>
Co-authored-by: frankekn <4488090+frankekn@users.noreply.github.com>
Reviewed-by: @frankekn
This commit is contained in:
@@ -4,6 +4,10 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
## Unreleased
|
||||
|
||||
### Changes
|
||||
|
||||
- Gateway/chat: accept non-image attachments through `chat.send` by staging them as agent-readable media paths, while keeping unsupported RPC attachment paths explicit instead of silently dropping files. Fixes #48123. (#67572) Thanks @samzong.
|
||||
|
||||
## 2026.4.27
|
||||
|
||||
### Changes
|
||||
|
||||
@@ -613,7 +613,11 @@ export async function getReplyFromConfig(
|
||||
}
|
||||
}
|
||||
|
||||
if (!useFastTestBootstrap && sessionKey && hasInboundMedia(ctx)) {
|
||||
// ctx.MediaStaged=true means the caller (e.g. chat.send RPC) already staged
|
||||
// synchronously so it could surface 5xx before respond(). Skipping here keeps
|
||||
// staging a single-call contract instead of relying on relative-path no-op
|
||||
// semantics in stageSandboxMedia.
|
||||
if (!useFastTestBootstrap && sessionKey && !ctx.MediaStaged && hasInboundMedia(ctx)) {
|
||||
const { stageSandboxMedia } = await loadStageSandboxMediaRuntime();
|
||||
await stageSandboxMedia({
|
||||
ctx,
|
||||
|
||||
@@ -19,18 +19,31 @@ import type { MsgContext, TemplateContext } from "../templating.js";
|
||||
|
||||
const STAGED_MEDIA_MAX_BYTES = MEDIA_MAX_BYTES;
|
||||
|
||||
// `staged` maps every absolute source path that was copied into the sandbox
|
||||
// (or remote cache) to its rewritten ctx path. Callers like chat.send's
|
||||
// prestage use this to detect partial failures: unstaged sources keep their
|
||||
// original absolute path in ctx.MediaPaths, so a length check against the
|
||||
// input cannot distinguish "everything staged" from "silently skipped some"
|
||||
// (e.g. the 5MB cap in STAGED_MEDIA_MAX_BYTES rejecting files that the
|
||||
// chat.send RPC already admitted under its 20MB cap).
|
||||
export type StageSandboxMediaResult = {
|
||||
staged: ReadonlyMap<string, string>;
|
||||
};
|
||||
|
||||
const EMPTY_STAGE_RESULT: StageSandboxMediaResult = { staged: new Map() };
|
||||
|
||||
export async function stageSandboxMedia(params: {
|
||||
ctx: MsgContext;
|
||||
sessionCtx: TemplateContext;
|
||||
cfg: OpenClawConfig;
|
||||
sessionKey?: string;
|
||||
workspaceDir: string;
|
||||
}) {
|
||||
}): Promise<StageSandboxMediaResult> {
|
||||
const { ctx, sessionCtx, cfg, sessionKey, workspaceDir } = params;
|
||||
const hasPathsArray = Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0;
|
||||
const rawPaths = resolveRawPaths(ctx);
|
||||
if (rawPaths.length === 0 || !sessionKey) {
|
||||
return;
|
||||
return EMPTY_STAGE_RESULT;
|
||||
}
|
||||
|
||||
const sandbox = await ensureSandboxWorkspaceForSession({
|
||||
@@ -45,7 +58,7 @@ export async function stageSandboxMedia(params: {
|
||||
: null;
|
||||
const effectiveWorkspaceDir = sandbox?.workspaceDir ?? remoteMediaCacheDir;
|
||||
if (!effectiveWorkspaceDir) {
|
||||
return;
|
||||
return EMPTY_STAGE_RESULT;
|
||||
}
|
||||
|
||||
await fs.mkdir(effectiveWorkspaceDir, { recursive: true });
|
||||
@@ -116,6 +129,8 @@ export async function stageSandboxMedia(params: {
|
||||
staged,
|
||||
hasPathsArray,
|
||||
});
|
||||
|
||||
return { staged };
|
||||
}
|
||||
|
||||
async function stageLocalFileIntoRoot(params: {
|
||||
|
||||
@@ -123,8 +123,15 @@ export type MsgContext = {
|
||||
MediaPaths?: string[];
|
||||
MediaUrls?: string[];
|
||||
MediaTypes?: string[];
|
||||
MediaWorkspaceDir?: string;
|
||||
/** Attachment indexes whose audio was already transcribed before media understanding runs. */
|
||||
MediaTranscribedIndexes?: number[];
|
||||
/**
|
||||
* Marker: skip downstream stageSandboxMedia. chat.send RPC sets this so
|
||||
* staging runs synchronously before respond() and surfaces 5xx to the
|
||||
* client; any later failure only reaches the broadcast channel.
|
||||
*/
|
||||
MediaStaged?: boolean;
|
||||
/** Telegram sticker metadata (emoji, set name, file IDs, cached description). */
|
||||
Sticker?: StickerContextMetadata;
|
||||
/** True when current-turn sticker media is present in MediaPaths (false for cached-description path). */
|
||||
|
||||
@@ -1,26 +1,66 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { deleteMediaBuffer } from "../media/store.js";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const saveMediaBufferMock = vi.hoisted(() =>
|
||||
vi.fn(async (_buffer: Buffer, mime?: string, _subdir?: string) => ({
|
||||
id: `fake-id-${Math.random().toString(36).slice(2, 10)}`,
|
||||
path: `/tmp/openclaw-test-media/inbound/fake.${mime?.split("/")[1] ?? "bin"}`,
|
||||
size: 0,
|
||||
contentType: mime,
|
||||
})),
|
||||
);
|
||||
const deleteMediaBufferMock = vi.hoisted(() =>
|
||||
vi.fn(async (_id: string, _subdir?: string) => undefined),
|
||||
);
|
||||
|
||||
vi.mock("../media/store.js", async (importOriginal) => {
|
||||
const actual = (await importOriginal()) as Record<string, unknown>;
|
||||
return {
|
||||
...actual,
|
||||
saveMediaBuffer: saveMediaBufferMock,
|
||||
deleteMediaBuffer: deleteMediaBufferMock,
|
||||
};
|
||||
});
|
||||
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import { MAX_IMAGE_BYTES } from "../media/constants.js";
|
||||
import {
|
||||
buildMessageWithAttachments,
|
||||
type ChatAttachment,
|
||||
DEFAULT_CHAT_ATTACHMENT_MAX_MB,
|
||||
parseMessageWithAttachments,
|
||||
resolveChatAttachmentMaxBytes,
|
||||
UnsupportedAttachmentError,
|
||||
} from "./chat-attachments.js";
|
||||
|
||||
const PNG_1x1 =
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
|
||||
|
||||
async function parseWithWarnings(message: string, attachments: ChatAttachment[]) {
|
||||
async function parseWithWarnings(
|
||||
message: string,
|
||||
attachments: ChatAttachment[],
|
||||
opts: Parameters<typeof parseMessageWithAttachments>[2] = {},
|
||||
) {
|
||||
const logs: string[] = [];
|
||||
const parsed = await parseMessageWithAttachments(message, attachments, {
|
||||
log: { warn: (warning) => logs.push(warning) },
|
||||
...opts,
|
||||
});
|
||||
return { parsed, logs };
|
||||
}
|
||||
|
||||
async function cleanupOffloadedRefs(refs: { id: string }[]) {
|
||||
await Promise.allSettled(refs.map((ref) => deleteMediaBuffer(ref.id, "inbound")));
|
||||
await Promise.allSettled(refs.map((ref) => deleteMediaBufferMock(ref.id, "inbound")));
|
||||
}
|
||||
|
||||
beforeEach(() => {
|
||||
saveMediaBufferMock.mockClear();
|
||||
deleteMediaBufferMock.mockClear();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
describe("buildMessageWithAttachments", () => {
|
||||
it("embeds a single image as data URL", () => {
|
||||
const msg = buildMessageWithAttachments("see this", [
|
||||
@@ -81,19 +121,43 @@ describe("parseMessageWithAttachments", () => {
|
||||
expect(logs).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("drops non-image payloads and logs", async () => {
|
||||
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
|
||||
const { parsed, logs } = await parseWithWarnings("x", [
|
||||
it("accepts non-image payloads and offloads them via the media store", async () => {
|
||||
const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
|
||||
const { parsed, logs } = await parseWithWarnings("read this", [
|
||||
{
|
||||
type: "file",
|
||||
mimeType: "image/png",
|
||||
fileName: "not-image.pdf",
|
||||
mimeType: "application/pdf",
|
||||
fileName: "report.pdf",
|
||||
content: pdf,
|
||||
},
|
||||
]);
|
||||
expect(parsed.images).toHaveLength(0);
|
||||
expect(logs).toHaveLength(1);
|
||||
expect(logs[0]).toMatch(/non-image/i);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
const ref = parsed.offloadedRefs[0];
|
||||
expect(ref.mimeType).toBe("application/pdf");
|
||||
expect(ref.label).toBe("report.pdf");
|
||||
expect(ref.mediaRef).toMatch(/^media:\/\/inbound\//);
|
||||
// Non-image offloads MUST NOT inject a media://URI into the message —
|
||||
// the caller is responsible for routing offloadedRefs[].path into
|
||||
// ctx.MediaPaths so the workspace stage surfaces a real path.
|
||||
expect(parsed.message).toBe("read this");
|
||||
expect(saveMediaBufferMock).toHaveBeenCalledOnce();
|
||||
expect(saveMediaBufferMock.mock.calls[0]?.[1]).toBe("application/pdf");
|
||||
expect(logs).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("offloads opaque binary when sniff and provided mime are both absent", async () => {
|
||||
const unknown = Buffer.from("just some bytes that do not match any signature").toString(
|
||||
"base64",
|
||||
);
|
||||
const { parsed, logs } = await parseWithWarnings("take a look", [
|
||||
{ type: "file", fileName: "blob.dat", content: unknown },
|
||||
]);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/octet-stream");
|
||||
expect(saveMediaBufferMock.mock.calls[0]?.[1]).toBe("application/octet-stream");
|
||||
expect(parsed.message).toBe("take a look");
|
||||
expect(logs).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("prefers sniffed mime type and logs mismatch", async () => {
|
||||
@@ -111,28 +175,9 @@ describe("parseMessageWithAttachments", () => {
|
||||
expect(logs[0]).toMatch(/mime mismatch/i);
|
||||
});
|
||||
|
||||
it("persists unknown non-image files when sniff fails", async () => {
|
||||
const unknown = Buffer.from("not an image").toString("base64");
|
||||
const { parsed, logs } = await parseWithWarnings("x", [
|
||||
{ type: "file", fileName: "unknown.bin", content: unknown },
|
||||
]);
|
||||
try {
|
||||
expect(parsed.images).toHaveLength(0);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs[0]).toMatchObject({
|
||||
label: "unknown.bin",
|
||||
mimeType: "application/octet-stream",
|
||||
});
|
||||
expect(parsed.message).toMatch(/^x\n\[media attached: media:\/\/inbound\//);
|
||||
expect(logs).toHaveLength(0);
|
||||
} finally {
|
||||
await cleanupOffloadedRefs(parsed.offloadedRefs);
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps valid images and drops invalid ones", async () => {
|
||||
it("keeps image inline and offloads non-image side by side", async () => {
|
||||
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
|
||||
const { parsed, logs } = await parseWithWarnings("x", [
|
||||
const { parsed } = await parseWithWarnings("x", [
|
||||
{
|
||||
type: "image",
|
||||
mimeType: "image/png",
|
||||
@@ -141,15 +186,196 @@ describe("parseMessageWithAttachments", () => {
|
||||
},
|
||||
{
|
||||
type: "file",
|
||||
mimeType: "image/png",
|
||||
fileName: "not-image.pdf",
|
||||
mimeType: "application/pdf",
|
||||
fileName: "report.pdf",
|
||||
content: pdf,
|
||||
},
|
||||
]);
|
||||
expect(parsed.images).toHaveLength(1);
|
||||
expect(parsed.images[0]?.mimeType).toBe("image/png");
|
||||
expect(parsed.images[0]?.data).toBe(PNG_1x1);
|
||||
expect(logs.some((l) => /non-image/i.test(l))).toBe(true);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf");
|
||||
expect(parsed.imageOrder).toEqual(["inline"]);
|
||||
});
|
||||
|
||||
it("excludes non-image offloads from imageOrder in mixed batches", async () => {
|
||||
// Regression: a prior revision pushed "offloaded" for every offload,
|
||||
// including non-image files. In a [non-image, inline, offloaded-image]
|
||||
// batch that produced imageOrder=["offloaded","inline","offloaded"] even
|
||||
// though only one `[media attached: media://...]` line is ever appended
|
||||
// to the prompt (for the image offload). extractTrailingAttachmentMediaUris
|
||||
// then read count=2 against one trailing URI, and
|
||||
// mergePromptAttachmentImages placed the single offloaded image into the
|
||||
// first "offloaded" slot — swapping it ahead of the inline image.
|
||||
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
|
||||
const bigPng = Buffer.alloc(2_100_000);
|
||||
bigPng.set([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], 0);
|
||||
const { parsed } = await parseWithWarnings("x", [
|
||||
{ type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf },
|
||||
{ type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 },
|
||||
{
|
||||
type: "image",
|
||||
mimeType: "image/png",
|
||||
fileName: "big.png",
|
||||
content: bigPng.toString("base64"),
|
||||
},
|
||||
]);
|
||||
expect(parsed.images).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs.map((ref) => ref.mimeType)).toEqual([
|
||||
"application/pdf",
|
||||
"image/png",
|
||||
]);
|
||||
expect(parsed.imageOrder).toEqual(["inline", "offloaded"]);
|
||||
// The offloaded-image URI is the sole trailing media:// line, matching
|
||||
// imageOrder's single "offloaded" slot.
|
||||
const trailingMediaLines = parsed.message
|
||||
.split("\n")
|
||||
.filter((line) => line.trim().startsWith("[media attached: media://inbound/"));
|
||||
expect(trailingMediaLines).toHaveLength(1);
|
||||
});
|
||||
|
||||
it("rejects oversized images before offload", async () => {
|
||||
const big = Buffer.alloc(MAX_IMAGE_BYTES + 1, 1).toString("base64");
|
||||
|
||||
await expect(
|
||||
parseMessageWithAttachments(
|
||||
"x",
|
||||
[{ type: "image", mimeType: "image/png", fileName: "huge.png", content: big }],
|
||||
{ maxBytes: resolveChatAttachmentMaxBytes({} as OpenClawConfig), log: { warn: () => {} } },
|
||||
),
|
||||
).rejects.toThrow(/image exceeds size limit/i);
|
||||
expect(saveMediaBufferMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("preserves specific OOXML mime when sniff returns generic zip (docx)", async () => {
|
||||
const docx = Buffer.from("PK\u0003\u0004fake-docx-content").toString("base64");
|
||||
const { parsed } = await parseWithWarnings("x", [
|
||||
{
|
||||
type: "file",
|
||||
mimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
fileName: "spec.docx",
|
||||
content: docx,
|
||||
},
|
||||
]);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs[0]?.label).toBe("spec.docx");
|
||||
// Docx sniffs as application/zip; the provided OOXML mime must win so the
|
||||
// agent sees the real document type, not a generic archive.
|
||||
expect(parsed.offloadedRefs[0]?.mimeType).toBe(
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
);
|
||||
});
|
||||
|
||||
it("recovers specific mime from filename extension when sniff is generic and provided mime is absent", async () => {
|
||||
const xlsx = Buffer.from("PK\u0003\u0004fake-xlsx").toString("base64");
|
||||
const { parsed } = await parseWithWarnings("x", [
|
||||
{ type: "file", fileName: "sheet.xlsx", content: xlsx },
|
||||
]);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs[0]?.mimeType).toBe(
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
);
|
||||
});
|
||||
|
||||
it("accepts zip attachments via workspace offload", async () => {
|
||||
const zip = Buffer.from("PK\u0003\u0004zip-archive-bytes").toString("base64");
|
||||
const { parsed } = await parseWithWarnings("x", [
|
||||
{
|
||||
type: "file",
|
||||
mimeType: "application/zip",
|
||||
fileName: "bundle.zip",
|
||||
content: zip,
|
||||
},
|
||||
]);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs[0]?.label).toBe("bundle.zip");
|
||||
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/zip");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseMessageWithAttachments validation errors", () => {
|
||||
it("throws UnsupportedAttachmentError on empty payload", async () => {
|
||||
await expect(
|
||||
parseMessageWithAttachments(
|
||||
"x",
|
||||
[{ type: "file", mimeType: "application/pdf", fileName: "empty.pdf", content: "" }],
|
||||
{ log: { warn: () => {} } },
|
||||
),
|
||||
).rejects.toMatchObject({
|
||||
name: "UnsupportedAttachmentError",
|
||||
reason: "empty-payload",
|
||||
});
|
||||
expect(saveMediaBufferMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("throws UnsupportedAttachmentError on non-image when acceptNonImage is false", async () => {
|
||||
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
|
||||
let caught: unknown;
|
||||
try {
|
||||
await parseMessageWithAttachments(
|
||||
"x",
|
||||
[{ type: "file", mimeType: "application/pdf", fileName: "a.pdf", content: pdf }],
|
||||
{ log: { warn: () => {} }, acceptNonImage: false },
|
||||
);
|
||||
} catch (err) {
|
||||
caught = err;
|
||||
}
|
||||
expect(caught).toBeInstanceOf(UnsupportedAttachmentError);
|
||||
expect((caught as UnsupportedAttachmentError).reason).toBe("unsupported-non-image");
|
||||
expect(saveMediaBufferMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("rejects generic-container payloads mislabeled as images when acceptNonImage is false", async () => {
|
||||
const docx = Buffer.from("PK\u0003\u0004fake-docx-content").toString("base64");
|
||||
let caught: unknown;
|
||||
try {
|
||||
await parseMessageWithAttachments(
|
||||
"x",
|
||||
[{ type: "file", mimeType: "image/png", fileName: "report.docx", content: docx }],
|
||||
{ log: { warn: () => {} }, acceptNonImage: false },
|
||||
);
|
||||
} catch (err) {
|
||||
caught = err;
|
||||
}
|
||||
expect(caught).toBeInstanceOf(UnsupportedAttachmentError);
|
||||
expect((caught as UnsupportedAttachmentError).reason).toBe("unsupported-non-image");
|
||||
expect(saveMediaBufferMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("throws UnsupportedAttachmentError on image when supportsInlineImages is false", async () => {
|
||||
let caught: unknown;
|
||||
try {
|
||||
await parseMessageWithAttachments(
|
||||
"x",
|
||||
[{ type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 }],
|
||||
{ log: { warn: () => {} }, supportsInlineImages: false },
|
||||
);
|
||||
} catch (err) {
|
||||
caught = err;
|
||||
}
|
||||
expect(caught).toBeInstanceOf(UnsupportedAttachmentError);
|
||||
expect((caught as UnsupportedAttachmentError).reason).toBe("text-only-image");
|
||||
expect(saveMediaBufferMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("still offloads non-image attachments when supportsInlineImages is false", async () => {
|
||||
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
|
||||
const { parsed } = await parseWithWarnings(
|
||||
"x",
|
||||
[{ type: "file", mimeType: "application/pdf", fileName: "a.pdf", content: pdf }],
|
||||
{ supportsInlineImages: false },
|
||||
);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf");
|
||||
expect(saveMediaBufferMock).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
it("passes through unchanged on text-only session with no attachments", async () => {
|
||||
const { parsed } = await parseWithWarnings("hello", [], { supportsInlineImages: false });
|
||||
expect(parsed.message).toBe("hello");
|
||||
expect(parsed.images).toHaveLength(0);
|
||||
expect(parsed.offloadedRefs).toHaveLength(0);
|
||||
expect(saveMediaBufferMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("persists non-image file attachments as media refs", async () => {
|
||||
@@ -168,13 +394,13 @@ describe("parseMessageWithAttachments", () => {
|
||||
|
||||
try {
|
||||
expect(parsed.images).toHaveLength(0);
|
||||
expect(parsed.imageOrder).toEqual(["offloaded"]);
|
||||
expect(parsed.imageOrder).toEqual([]);
|
||||
expect(parsed.offloadedRefs).toHaveLength(1);
|
||||
expect(parsed.offloadedRefs[0]).toMatchObject({
|
||||
mimeType: "application/pdf",
|
||||
label: "brief.pdf",
|
||||
});
|
||||
expect(parsed.message).toMatch(/^read this\n\[media attached: media:\/\/inbound\//);
|
||||
expect(parsed.message).toBe("read this");
|
||||
} finally {
|
||||
await cleanupOffloadedRefs(parsed.offloadedRefs);
|
||||
}
|
||||
@@ -258,6 +484,32 @@ describe("parseMessageWithAttachments", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveChatAttachmentMaxBytes", () => {
|
||||
const MB = 1024 * 1024;
|
||||
const DEFAULT_BYTES = DEFAULT_CHAT_ATTACHMENT_MAX_MB * MB;
|
||||
|
||||
const cfgWithMediaMaxMb = (value: unknown): OpenClawConfig =>
|
||||
({ agents: { defaults: { mediaMaxMb: value } } }) as unknown as OpenClawConfig;
|
||||
|
||||
it("honours a configured agents.defaults.mediaMaxMb", () => {
|
||||
expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(10))).toBe(10 * MB);
|
||||
expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(50))).toBe(50 * MB);
|
||||
});
|
||||
|
||||
it("falls back to DEFAULT_CHAT_ATTACHMENT_MAX_MB when unset", () => {
|
||||
expect(resolveChatAttachmentMaxBytes({} as OpenClawConfig)).toBe(DEFAULT_BYTES);
|
||||
expect(resolveChatAttachmentMaxBytes({ agents: {} } as unknown as OpenClawConfig)).toBe(
|
||||
DEFAULT_BYTES,
|
||||
);
|
||||
});
|
||||
|
||||
it("rejects non-positive, non-finite, or non-number values", () => {
|
||||
for (const bad of [0, -5, Number.NaN, Number.POSITIVE_INFINITY, "50", null, undefined]) {
|
||||
expect(resolveChatAttachmentMaxBytes(cfgWithMediaMaxMb(bad))).toBe(DEFAULT_BYTES);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("shared attachment validation", () => {
|
||||
it("rejects invalid base64 content for both builder and parser", async () => {
|
||||
const bad: ChatAttachment = {
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import { formatErrorMessage } from "../infra/errors.js";
|
||||
import { estimateBase64DecodedBytes } from "../media/base64.js";
|
||||
import { MAX_IMAGE_BYTES } from "../media/constants.js";
|
||||
import { extensionForMime, mimeTypeFromFilePath } from "../media/mime.js";
|
||||
import type { PromptImageOrderEntry } from "../media/prompt-image-order.js";
|
||||
import { sniffMimeFromBase64 } from "../media/sniff-mime-from-base64.js";
|
||||
import { deleteMediaBuffer, saveMediaBuffer } from "../media/store.js";
|
||||
import {
|
||||
normalizeLowercaseStringOrEmpty,
|
||||
normalizeOptionalLowercaseString,
|
||||
} from "../shared/string-coerce.js";
|
||||
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
|
||||
|
||||
export type ChatAttachment = {
|
||||
type?: string;
|
||||
@@ -21,47 +21,19 @@ export type ChatImageContent = {
|
||||
mimeType: string;
|
||||
};
|
||||
|
||||
/**
|
||||
* Metadata for an attachment that was offloaded to the media store.
|
||||
*
|
||||
* Included in ParsedMessageWithImages.offloadedRefs so that callers can
|
||||
* persist structured media metadata for transcripts. Without this, consumers
|
||||
* that derive MediaPath/MediaPaths from the `images` array (e.g.
|
||||
* persistChatSendImages and buildChatSendTranscriptMessage in chat.ts) would
|
||||
* silently omit all large attachments that were offloaded to disk.
|
||||
*/
|
||||
export type OffloadedRef = {
|
||||
/** Opaque media URI injected into the message, e.g. "media://inbound/<id>" */
|
||||
mediaRef: string;
|
||||
/** The raw media ID from SavedMedia.id, usable with resolveMediaBufferPath */
|
||||
id: string;
|
||||
/** Absolute filesystem path returned by saveMediaBuffer — used for transcript MediaPath */
|
||||
path: string;
|
||||
/** MIME type of the offloaded attachment */
|
||||
mimeType: string;
|
||||
/** The label / filename of the original attachment */
|
||||
label: string;
|
||||
sizeBytes: number;
|
||||
};
|
||||
|
||||
export type ParsedMessageWithImages = {
|
||||
message: string;
|
||||
/** Small attachments (≤ OFFLOAD_THRESHOLD_BYTES) passed inline to the model */
|
||||
images: ChatImageContent[];
|
||||
/** Original accepted attachment order after inline/offloaded split. */
|
||||
imageOrder: PromptImageOrderEntry[];
|
||||
/**
|
||||
* Large attachments (> OFFLOAD_THRESHOLD_BYTES) that were offloaded to the
|
||||
* media store. Each entry corresponds to a `[media attached: media://inbound/<id>]`
|
||||
* marker appended to `message`.
|
||||
*
|
||||
* Callers MUST persist this list separately for transcript media metadata.
|
||||
* It is intentionally separate from `images` because downstream model calls
|
||||
* do not receive these as inline image blocks.
|
||||
*
|
||||
* ⚠️ Call sites (chat.ts, agent.ts, server-node-events.ts) MUST also pass
|
||||
* `supportsImages: modelSupportsImages(model)` so text-only model runs
|
||||
* offload images as media refs instead of passing inline image blocks.
|
||||
*/
|
||||
offloadedRefs: OffloadedRef[];
|
||||
};
|
||||
|
||||
@@ -78,49 +50,38 @@ type NormalizedAttachment = {
|
||||
|
||||
type SavedMedia = {
|
||||
id: string;
|
||||
path?: string;
|
||||
path: string;
|
||||
};
|
||||
|
||||
const OFFLOAD_THRESHOLD_BYTES = 2_000_000;
|
||||
const TEXT_ONLY_OFFLOAD_LIMIT = 10;
|
||||
|
||||
const MIME_TO_EXT: Record<string, string> = {
|
||||
"image/jpeg": ".jpg",
|
||||
"image/jpg": ".jpg",
|
||||
"image/png": ".png",
|
||||
"image/webp": ".webp",
|
||||
"image/gif": ".gif",
|
||||
"image/heic": ".heic",
|
||||
"image/heif": ".heif",
|
||||
// bmp/tiff excluded from SUPPORTED_OFFLOAD_MIMES to avoid extension-loss
|
||||
// bug in store.ts; entries kept here for future extension support
|
||||
"image/bmp": ".bmp",
|
||||
"image/tiff": ".tiff",
|
||||
};
|
||||
export const DEFAULT_CHAT_ATTACHMENT_MAX_MB = 20;
|
||||
|
||||
// Module-level Set for O(1) lookup — not rebuilt on every attachment iteration.
|
||||
//
|
||||
// heic/heif are included only if store.ts's extensionForMime maps them to an
|
||||
// extension. If it does not (same extension-loss risk as bmp/tiff), remove
|
||||
// them from this set.
|
||||
const SUPPORTED_OFFLOAD_MIMES = new Set([
|
||||
"image/jpeg",
|
||||
"image/jpg",
|
||||
"image/png",
|
||||
"image/webp",
|
||||
"image/gif",
|
||||
"image/heic",
|
||||
"image/heif",
|
||||
]);
|
||||
export function resolveChatAttachmentMaxBytes(cfg: OpenClawConfig): number {
|
||||
const configured = cfg.agents?.defaults?.mediaMaxMb;
|
||||
const mb =
|
||||
typeof configured === "number" && Number.isFinite(configured) && configured > 0
|
||||
? configured
|
||||
: DEFAULT_CHAT_ATTACHMENT_MAX_MB;
|
||||
return Math.floor(mb * 1024 * 1024);
|
||||
}
|
||||
|
||||
export type UnsupportedAttachmentReason =
|
||||
| "empty-payload"
|
||||
| "text-only-image"
|
||||
| "unsupported-non-image"
|
||||
| "non-image-too-large-for-sandbox";
|
||||
|
||||
export class UnsupportedAttachmentError extends Error {
|
||||
readonly reason: UnsupportedAttachmentReason;
|
||||
constructor(reason: UnsupportedAttachmentReason, message: string) {
|
||||
super(message);
|
||||
this.name = "UnsupportedAttachmentError";
|
||||
this.reason = reason;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Raised when the Gateway cannot persist an attachment to the media store.
|
||||
*
|
||||
* Distinct from ordinary input-validation errors so that Gateway handlers can
|
||||
* map it to a server-side 5xx status rather than a client 4xx.
|
||||
*
|
||||
* Example causes: ENOSPC, EPERM, unexpected saveMediaBuffer return shape.
|
||||
*/
|
||||
export class MediaOffloadError extends Error {
|
||||
readonly cause: unknown;
|
||||
constructor(message: string, options?: ErrorOptions) {
|
||||
@@ -142,42 +103,24 @@ function isImageMime(mime?: string): boolean {
|
||||
return typeof mime === "string" && mime.startsWith("image/");
|
||||
}
|
||||
|
||||
function isVideoMime(mime?: string): boolean {
|
||||
return typeof mime === "string" && mime.startsWith("video/");
|
||||
function isGenericContainerMime(mime?: string): boolean {
|
||||
return mime === "application/zip" || mime === "application/octet-stream";
|
||||
}
|
||||
|
||||
function isGenericMime(mime?: string): boolean {
|
||||
return (
|
||||
!mime ||
|
||||
mime === "application/octet-stream" ||
|
||||
mime === "binary/octet-stream" ||
|
||||
mime === "application/unknown"
|
||||
);
|
||||
function shouldIgnoreProvidedImageMime(params: {
|
||||
sniffedMime?: string;
|
||||
providedMime?: string;
|
||||
}): boolean {
|
||||
return isGenericContainerMime(params.sniffedMime) && isImageMime(params.providedMime);
|
||||
}
|
||||
|
||||
function isValidBase64(value: string): boolean {
|
||||
if (value.length === 0 || value.length % 4 !== 0) {
|
||||
return false;
|
||||
}
|
||||
// A full O(n) regex scan is safe: no overlapping quantifiers, fails linearly.
|
||||
// Prevents adversarial payloads padded with megabytes of whitespace from
|
||||
// bypassing length thresholds.
|
||||
return /^[A-Za-z0-9+/]+={0,2}$/.test(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Confirms that the decoded buffer produced by Buffer.from(b64, 'base64')
|
||||
* matches the pre-decode size estimate.
|
||||
*
|
||||
* Node's Buffer.from silently drops invalid base64 characters rather than
|
||||
* throwing. A material size discrepancy means the source string contained
|
||||
* embedded garbage that was silently stripped, which would produce a corrupted
|
||||
* file on disk. ±3 bytes of leeway accounts for base64 padding rounding.
|
||||
*
|
||||
* IMPORTANT: this is an input-validation check (4xx client error).
|
||||
* It MUST be called OUTSIDE the MediaOffloadError try/catch so that
|
||||
* corrupt-input errors are not misclassified as 5xx server errors.
|
||||
*/
|
||||
function verifyDecodedSize(buffer: Buffer, estimatedBytes: number, label: string): void {
|
||||
if (Math.abs(buffer.byteLength - estimatedBytes) > 3) {
|
||||
throw new Error(
|
||||
@@ -191,40 +134,34 @@ function ensureExtension(label: string, mime: string): string {
|
||||
if (/\.[a-zA-Z0-9]+$/.test(label)) {
|
||||
return label;
|
||||
}
|
||||
const ext = MIME_TO_EXT[normalizeLowercaseStringOrEmpty(mime)] ?? "";
|
||||
const ext = extensionForMime(mime) ?? "";
|
||||
return ext ? `${label}${ext}` : label;
|
||||
}
|
||||
|
||||
/**
|
||||
* Type guard for the return value of saveMediaBuffer.
|
||||
*
|
||||
* Also validates that the returned ID:
|
||||
* - is a non-empty string
|
||||
* - contains no path separators (/ or \) or null bytes
|
||||
*
|
||||
* Catching a bad shape here produces a cleaner error than a cryptic failure
|
||||
* deeper in the stack, and is treated as a 5xx infrastructure error.
|
||||
*/
|
||||
function assertSavedMedia(value: unknown, label: string): SavedMedia {
|
||||
if (
|
||||
value !== null &&
|
||||
typeof value === "object" &&
|
||||
"id" in value &&
|
||||
typeof (value as Record<string, unknown>).id === "string"
|
||||
value === null ||
|
||||
typeof value !== "object" ||
|
||||
!("id" in value) ||
|
||||
typeof (value as Record<string, unknown>).id !== "string"
|
||||
) {
|
||||
const id = (value as Record<string, unknown>).id as string;
|
||||
if (id.length === 0) {
|
||||
throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`);
|
||||
}
|
||||
if (id.includes("/") || id.includes("\\") || id.includes("\0")) {
|
||||
throw new Error(
|
||||
`attachment ${label}: saveMediaBuffer returned an unsafe media ID ` +
|
||||
`(contains path separator or null byte)`,
|
||||
);
|
||||
}
|
||||
return value as SavedMedia;
|
||||
throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`);
|
||||
}
|
||||
throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`);
|
||||
const id = (value as Record<string, unknown>).id as string;
|
||||
if (id.length === 0) {
|
||||
throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`);
|
||||
}
|
||||
if (id.includes("/") || id.includes("\\") || id.includes("\0")) {
|
||||
throw new Error(
|
||||
`attachment ${label}: saveMediaBuffer returned an unsafe media ID ` +
|
||||
`(contains path separator or null byte)`,
|
||||
);
|
||||
}
|
||||
const path = (value as Record<string, unknown>).path;
|
||||
if (typeof path !== "string" || path.length === 0) {
|
||||
throw new Error(`attachment ${label}: saveMediaBuffer returned no on-disk path`);
|
||||
}
|
||||
return { id, path };
|
||||
}
|
||||
|
||||
function normalizeAttachment(
|
||||
@@ -269,47 +206,22 @@ function validateAttachmentBase64OrThrow(
|
||||
return sizeBytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse attachments and extract images as structured content blocks.
|
||||
* Returns the message text, inline image blocks, and offloaded media refs.
|
||||
*
|
||||
* ## Offload behaviour
|
||||
* Attachments whose decoded size exceeds OFFLOAD_THRESHOLD_BYTES are saved to
|
||||
* disk via saveMediaBuffer and replaced with an opaque `media://inbound/<id>`
|
||||
* URI appended to the message. The agent resolves these URIs via
|
||||
* resolveMediaBufferPath before passing them to the model.
|
||||
*
|
||||
* ## Transcript metadata
|
||||
* Callers MUST use `result.offloadedRefs` to persist structured media metadata
|
||||
* for transcripts. These refs are intentionally excluded from `result.images`
|
||||
* because they are not passed inline to the model.
|
||||
*
|
||||
* ## Text-only model runs
|
||||
* Pass `supportsImages: false` for text-only model runs so images are offloaded
|
||||
* as `media://inbound/<id>` refs instead of being sent as inline image blocks.
|
||||
* The agent runner can then resolve the refs through the normal media path.
|
||||
*
|
||||
* ## Cleanup on failure
|
||||
* On any parse failure after files have already been offloaded, best-effort
|
||||
* cleanup is performed before rethrowing so that malformed requests do not
|
||||
* accumulate orphaned files on disk ahead of the periodic TTL sweep.
|
||||
*
|
||||
* ## Known ordering limitation
|
||||
* In mixed large/small batches, the model receives images in a different order
|
||||
* than the original attachment list because detectAndLoadPromptImages
|
||||
* initialises from existingImages first, then appends prompt-detected refs.
|
||||
* A future refactor should unify all image references into a single ordered list.
|
||||
*
|
||||
* @throws {MediaOffloadError} Infrastructure failure saving to media store → 5xx.
|
||||
* @throws {Error} Input validation failure → 4xx.
|
||||
*/
|
||||
export async function parseMessageWithAttachments(
|
||||
message: string,
|
||||
attachments: ChatAttachment[] | undefined,
|
||||
opts?: { maxBytes?: number; log?: AttachmentLog; supportsImages?: boolean },
|
||||
opts?: {
|
||||
maxBytes?: number;
|
||||
log?: AttachmentLog;
|
||||
supportsImages?: boolean;
|
||||
supportsInlineImages?: boolean;
|
||||
acceptNonImage?: boolean;
|
||||
},
|
||||
): Promise<ParsedMessageWithImages> {
|
||||
const maxBytes = opts?.maxBytes ?? 5_000_000;
|
||||
const maxBytes = opts?.maxBytes ?? DEFAULT_CHAT_ATTACHMENT_MAX_MB * 1024 * 1024;
|
||||
const log = opts?.log;
|
||||
const shouldForceImageOffload = opts?.supportsImages === false;
|
||||
const supportsInlineImages = opts?.supportsInlineImages !== false;
|
||||
const acceptNonImage = opts?.acceptNonImage !== false;
|
||||
|
||||
if (!attachments || attachments.length === 0) {
|
||||
return { message, images: [], imageOrder: [], offloadedRefs: [] };
|
||||
@@ -319,11 +231,7 @@ export async function parseMessageWithAttachments(
|
||||
const imageOrder: PromptImageOrderEntry[] = [];
|
||||
const offloadedRefs: OffloadedRef[] = [];
|
||||
let updatedMessage = message;
|
||||
const shouldForceOffload = opts?.supportsImages === false;
|
||||
let textOnlyImageOffloadCount = 0;
|
||||
|
||||
// Track IDs of files saved during this request for cleanup if a later
|
||||
// attachment fails validation and the entire parse is aborted.
|
||||
const savedMediaIds: string[] = [];
|
||||
|
||||
try {
|
||||
@@ -339,16 +247,14 @@ export async function parseMessageWithAttachments(
|
||||
|
||||
const { base64: b64, label, mime } = normalized;
|
||||
|
||||
if (b64.length === 0) {
|
||||
throw new UnsupportedAttachmentError("empty-payload", `attachment ${label}: empty payload`);
|
||||
}
|
||||
if (!isValidBase64(b64)) {
|
||||
throw new Error(`attachment ${label}: invalid base64 content`);
|
||||
}
|
||||
|
||||
const sizeBytes = estimateBase64DecodedBytes(b64);
|
||||
if (sizeBytes <= 0) {
|
||||
log?.warn(`attachment ${label}: estimated size is zero, dropping`);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (sizeBytes > maxBytes) {
|
||||
throw new Error(
|
||||
`attachment ${label}: exceeds size limit (${sizeBytes} > ${maxBytes} bytes)`,
|
||||
@@ -357,67 +263,74 @@ export async function parseMessageWithAttachments(
|
||||
|
||||
const providedMime = normalizeMime(mime);
|
||||
const sniffedMime = normalizeMime(await sniffMimeFromBase64(b64));
|
||||
const labelMime = normalizeMime(mimeTypeFromFilePath(label));
|
||||
const trustedProvidedMime = shouldIgnoreProvidedImageMime({ sniffedMime, providedMime })
|
||||
? undefined
|
||||
: providedMime;
|
||||
|
||||
if (sniffedMime && !isImageMime(sniffedMime) && isImageMime(providedMime)) {
|
||||
log?.warn(`attachment ${label}: detected non-image (${sniffedMime}), dropping`);
|
||||
continue;
|
||||
}
|
||||
// Prefer specific MIME signals over generic container types. OOXML
|
||||
// documents (docx/xlsx/pptx) sniff as application/zip; without this
|
||||
// priority the agent would receive a `.zip` instead of the specific
|
||||
// Office document the caller declared.
|
||||
const finalMime =
|
||||
(sniffedMime && !isGenericContainerMime(sniffedMime) && sniffedMime) ||
|
||||
(trustedProvidedMime &&
|
||||
!isGenericContainerMime(trustedProvidedMime) &&
|
||||
trustedProvidedMime) ||
|
||||
(labelMime && !isGenericContainerMime(labelMime) && labelMime) ||
|
||||
sniffedMime ||
|
||||
trustedProvidedMime ||
|
||||
labelMime ||
|
||||
"application/octet-stream";
|
||||
|
||||
const shouldHandleAsImage =
|
||||
isImageMime(sniffedMime) || (isImageMime(providedMime) && !sniffedMime);
|
||||
if (!shouldHandleAsImage) {
|
||||
const finalMime = sniffedMime ?? providedMime ?? "application/octet-stream";
|
||||
if (isVideoMime(finalMime)) {
|
||||
log?.warn(`attachment ${label}: video attachments are not supported, dropping`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const buffer = Buffer.from(b64, "base64");
|
||||
verifyDecodedSize(buffer, sizeBytes, label);
|
||||
|
||||
try {
|
||||
const rawResult = await saveMediaBuffer(buffer, finalMime, "inbound", maxBytes, label);
|
||||
const savedMedia = assertSavedMedia(rawResult, label);
|
||||
savedMediaIds.push(savedMedia.id);
|
||||
|
||||
const mediaRef = `media://inbound/${savedMedia.id}`;
|
||||
updatedMessage += `\n[media attached: ${mediaRef}]`;
|
||||
log?.info?.(`[Gateway] Saved file attachment. Saved: ${mediaRef}`);
|
||||
offloadedRefs.push({
|
||||
mediaRef,
|
||||
id: savedMedia.id,
|
||||
path: savedMedia.path ?? "",
|
||||
mimeType: finalMime,
|
||||
label,
|
||||
});
|
||||
imageOrder.push("offloaded");
|
||||
} catch (err) {
|
||||
const errorMessage = formatErrorMessage(err);
|
||||
throw new MediaOffloadError(
|
||||
`[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`,
|
||||
{ cause: err },
|
||||
);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (
|
||||
sniffedMime &&
|
||||
providedMime &&
|
||||
!isGenericMime(providedMime) &&
|
||||
!isGenericContainerMime(providedMime) &&
|
||||
sniffedMime !== providedMime
|
||||
) {
|
||||
const usedSource =
|
||||
finalMime === sniffedMime
|
||||
? "sniffed"
|
||||
: finalMime === providedMime
|
||||
? "provided"
|
||||
: "label-derived";
|
||||
log?.warn(
|
||||
`attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using sniffed`,
|
||||
`attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using ${usedSource}`,
|
||||
);
|
||||
}
|
||||
|
||||
// Third fallback normalises `mime` so a raw un-normalised string (e.g.
|
||||
// "IMAGE/JPEG") does not silently bypass the SUPPORTED_OFFLOAD_MIMES check.
|
||||
const finalMime = sniffedMime ?? providedMime ?? normalizeMime(mime) ?? mime;
|
||||
const isImage = isImageMime(finalMime);
|
||||
if (isImage && !supportsInlineImages && !shouldForceImageOffload) {
|
||||
throw new UnsupportedAttachmentError(
|
||||
"text-only-image",
|
||||
`attachment ${label}: active model does not accept image inputs`,
|
||||
);
|
||||
}
|
||||
if (!isImage && !acceptNonImage) {
|
||||
throw new UnsupportedAttachmentError(
|
||||
"unsupported-non-image",
|
||||
`attachment ${label}: non-image attachments (${finalMime}) are not supported on this entrypoint`,
|
||||
);
|
||||
}
|
||||
// Agent-side hydration (loadImageFromRef via optimizeAndClampImage / GIF
|
||||
// direct compare) caps at MAX_IMAGE_BYTES. Accepting images above that
|
||||
// would offload a file the runner later drops to null — a successful
|
||||
// response with a silently missing image. Reject here so the client
|
||||
// sees an explicit 4xx. Non-image attachments keep the full maxBytes
|
||||
// ceiling because their host path (ctx.MediaPaths → Read/Bash) doesn't
|
||||
// load into the model.
|
||||
if (isImage && sizeBytes > MAX_IMAGE_BYTES) {
|
||||
throw new Error(
|
||||
`attachment ${label}: image exceeds size limit (${sizeBytes} > ${MAX_IMAGE_BYTES} bytes)`,
|
||||
);
|
||||
}
|
||||
|
||||
let isOffloaded = false;
|
||||
|
||||
if (shouldForceOffload && textOnlyImageOffloadCount >= TEXT_ONLY_OFFLOAD_LIMIT) {
|
||||
if (
|
||||
shouldForceImageOffload &&
|
||||
isImage &&
|
||||
textOnlyImageOffloadCount >= TEXT_ONLY_OFFLOAD_LIMIT
|
||||
) {
|
||||
log?.warn(
|
||||
`attachment ${label}: dropping image because text-only offload limit ` +
|
||||
`${TEXT_ONLY_OFFLOAD_LIMIT} was reached`,
|
||||
@@ -426,93 +339,64 @@ export async function parseMessageWithAttachments(
|
||||
continue;
|
||||
}
|
||||
|
||||
if (shouldForceOffload || sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
|
||||
const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime);
|
||||
const shouldOffload =
|
||||
shouldForceImageOffload || !isImage || sizeBytes > OFFLOAD_THRESHOLD_BYTES;
|
||||
|
||||
if (!isSupportedForOffload) {
|
||||
if (shouldForceOffload) {
|
||||
log?.warn(
|
||||
`attachment ${label}: format ${finalMime} cannot be offloaded for ` +
|
||||
"text-only model, dropping",
|
||||
);
|
||||
continue;
|
||||
}
|
||||
// Passing this inline would reintroduce the OOM risk this PR prevents.
|
||||
throw new Error(
|
||||
`attachment ${label}: format ${finalMime} is too large to pass inline ` +
|
||||
`(${sizeBytes} > ${OFFLOAD_THRESHOLD_BYTES} bytes) and cannot be offloaded. ` +
|
||||
`Please convert to JPEG, PNG, WEBP, GIF, HEIC, or HEIF.`,
|
||||
);
|
||||
}
|
||||
|
||||
// Decode and run input-validation BEFORE the MediaOffloadError try/catch.
|
||||
// verifyDecodedSize is a 4xx client error and must not be wrapped as a
|
||||
// 5xx MediaOffloadError.
|
||||
const buffer = Buffer.from(b64, "base64");
|
||||
verifyDecodedSize(buffer, sizeBytes, label);
|
||||
|
||||
// Only the storage operation is wrapped so callers can distinguish
|
||||
// infrastructure failures (5xx) from input errors (4xx).
|
||||
try {
|
||||
const labelWithExt = ensureExtension(label, finalMime);
|
||||
|
||||
const rawResult = await saveMediaBuffer(
|
||||
buffer,
|
||||
finalMime,
|
||||
"inbound",
|
||||
maxBytes,
|
||||
labelWithExt,
|
||||
);
|
||||
|
||||
const savedMedia = assertSavedMedia(rawResult, label);
|
||||
|
||||
// Track for cleanup if a subsequent attachment fails.
|
||||
savedMediaIds.push(savedMedia.id);
|
||||
|
||||
// Opaque URI — compatible with workspaceOnly sandboxes and decouples
|
||||
// the Gateway from the agent's filesystem layout.
|
||||
const mediaRef = `media://inbound/${savedMedia.id}`;
|
||||
|
||||
updatedMessage += `\n[media attached: ${mediaRef}]`;
|
||||
log?.info?.(
|
||||
shouldForceOffload
|
||||
? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
|
||||
: `[Gateway] Intercepted large image payload. Saved: ${mediaRef}`,
|
||||
);
|
||||
|
||||
// Record for transcript metadata — separate from `images` because
|
||||
// these are not passed inline to the model.
|
||||
offloadedRefs.push({
|
||||
mediaRef,
|
||||
id: savedMedia.id,
|
||||
path: savedMedia.path ?? "",
|
||||
mimeType: finalMime,
|
||||
label,
|
||||
});
|
||||
imageOrder.push("offloaded");
|
||||
if (shouldForceOffload) {
|
||||
textOnlyImageOffloadCount++;
|
||||
}
|
||||
|
||||
isOffloaded = true;
|
||||
} catch (err) {
|
||||
const errorMessage = formatErrorMessage(err);
|
||||
throw new MediaOffloadError(
|
||||
`[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`,
|
||||
{ cause: err },
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (isOffloaded) {
|
||||
if (!shouldOffload) {
|
||||
images.push({ type: "image", data: b64, mimeType: finalMime });
|
||||
imageOrder.push("inline");
|
||||
continue;
|
||||
}
|
||||
|
||||
images.push({ type: "image", data: b64, mimeType: finalMime });
|
||||
imageOrder.push("inline");
|
||||
const buffer = Buffer.from(b64, "base64");
|
||||
verifyDecodedSize(buffer, sizeBytes, label);
|
||||
|
||||
let savedMedia: SavedMedia;
|
||||
try {
|
||||
const labelWithExt = ensureExtension(label, finalMime);
|
||||
const rawResult = await saveMediaBuffer(
|
||||
buffer,
|
||||
finalMime,
|
||||
"inbound",
|
||||
maxBytes,
|
||||
labelWithExt,
|
||||
);
|
||||
savedMedia = assertSavedMedia(rawResult, label);
|
||||
} catch (err) {
|
||||
throw new MediaOffloadError(
|
||||
`[Gateway Error] Failed to save intercepted media to disk: ${formatErrorMessage(err)}`,
|
||||
{ cause: err },
|
||||
);
|
||||
}
|
||||
|
||||
savedMediaIds.push(savedMedia.id);
|
||||
|
||||
const mediaRef = `media://inbound/${savedMedia.id}`;
|
||||
if (isImage) {
|
||||
updatedMessage += `\n[media attached: ${mediaRef}]`;
|
||||
}
|
||||
log?.info?.(
|
||||
shouldForceImageOffload && isImage
|
||||
? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
|
||||
: `[Gateway] Offloaded attachment (${finalMime}). Saved: ${mediaRef}`,
|
||||
);
|
||||
|
||||
offloadedRefs.push({
|
||||
mediaRef,
|
||||
id: savedMedia.id,
|
||||
path: savedMedia.path,
|
||||
mimeType: finalMime,
|
||||
label,
|
||||
sizeBytes,
|
||||
});
|
||||
if (isImage) {
|
||||
imageOrder.push("offloaded");
|
||||
if (shouldForceImageOffload) {
|
||||
textOnlyImageOffloadCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
// Best-effort cleanup before rethrowing.
|
||||
if (savedMediaIds.length > 0) {
|
||||
await Promise.allSettled(savedMediaIds.map((id) => deleteMediaBuffer(id, "inbound")));
|
||||
}
|
||||
|
||||
@@ -79,7 +79,11 @@ import {
|
||||
} from "../../utils/message-channel.js";
|
||||
import { resolveAssistantIdentity } from "../assistant-identity.js";
|
||||
import { registerChatAbortController, resolveAgentRunExpiresAtMs } from "../chat-abort.js";
|
||||
import { MediaOffloadError, parseMessageWithAttachments } from "../chat-attachments.js";
|
||||
import {
|
||||
MediaOffloadError,
|
||||
parseMessageWithAttachments,
|
||||
resolveChatAttachmentMaxBytes,
|
||||
} from "../chat-attachments.js";
|
||||
import { resolveAssistantAvatarUrl } from "../control-ui-shared.js";
|
||||
import { ADMIN_SCOPE } from "../method-scopes.js";
|
||||
import { GATEWAY_CLIENT_CAPS, hasGatewayClientCap } from "../protocol/client-info.js";
|
||||
@@ -498,7 +502,7 @@ export const agentHandlers: GatewayRequestHandlers = {
|
||||
}
|
||||
const effectiveProvider = providerOverride || baseProvider;
|
||||
const effectiveModel = modelOverride || baseModel;
|
||||
const supportsImages = await resolveGatewayModelSupportsImages({
|
||||
const supportsInlineImages = await resolveGatewayModelSupportsImages({
|
||||
loadGatewayModelCatalog: context.loadGatewayModelCatalog,
|
||||
provider: effectiveProvider,
|
||||
model: effectiveModel,
|
||||
@@ -506,9 +510,13 @@ export const agentHandlers: GatewayRequestHandlers = {
|
||||
|
||||
try {
|
||||
const parsed = await parseMessageWithAttachments(message, normalizedAttachments, {
|
||||
maxBytes: 5_000_000,
|
||||
maxBytes: resolveChatAttachmentMaxBytes(cfg),
|
||||
log: context.logGateway,
|
||||
supportsImages,
|
||||
supportsInlineImages,
|
||||
// agent.run does not yet wire a ctx.MediaPaths stage path, so reject
|
||||
// non-image attachments explicitly (UnsupportedAttachmentError)
|
||||
// instead of saving them where the agent cannot reach them.
|
||||
acceptNonImage: false,
|
||||
});
|
||||
message = parsed.message.trim();
|
||||
images = parsed.images;
|
||||
|
||||
@@ -66,6 +66,15 @@ const mockState = vi.hoisted(() => ({
|
||||
saveMediaWait: null as Promise<void> | null,
|
||||
activeSaveMediaCalls: 0,
|
||||
maxActiveSaveMediaCalls: 0,
|
||||
sandboxWorkspace: null as { workspaceDir: string; containerWorkdir?: string } | null,
|
||||
stageSandboxMediaError: null as Error | null,
|
||||
stagedRelativePaths: null as string[] | null,
|
||||
// `unstagedSources` lets tests simulate partial staging failure: absolute
|
||||
// source paths listed here are excluded from the returned `staged` map even
|
||||
// though ctx still carries their rewritten paths. This mirrors how the real
|
||||
// stageSandboxMedia silently skips over-cap files.
|
||||
unstagedSources: null as string[] | null,
|
||||
deleteMediaBufferCalls: [] as Array<{ id: string; subdir?: string }>,
|
||||
}));
|
||||
|
||||
const bindingMocks = vi.hoisted(() => ({
|
||||
@@ -216,11 +225,54 @@ vi.mock("../../sessions/transcript-events.js", () => ({
|
||||
),
|
||||
}));
|
||||
|
||||
vi.mock("../../agents/sandbox/context.js", async () => {
|
||||
const original = await vi.importActual<typeof import("../../agents/sandbox/context.js")>(
|
||||
"../../agents/sandbox/context.js",
|
||||
);
|
||||
return {
|
||||
...original,
|
||||
ensureSandboxWorkspaceForSession: vi.fn(async () => mockState.sandboxWorkspace),
|
||||
};
|
||||
});
|
||||
|
||||
vi.mock("../../auto-reply/reply/stage-sandbox-media.js", () => ({
|
||||
stageSandboxMedia: vi.fn(
|
||||
async (params: { ctx: { MediaPaths?: string[]; MediaPath?: string } }) => {
|
||||
if (mockState.stageSandboxMediaError) {
|
||||
throw mockState.stageSandboxMediaError;
|
||||
}
|
||||
const staged = new Map<string, string>();
|
||||
const originalPaths = params.ctx.MediaPaths ?? [];
|
||||
if (mockState.stagedRelativePaths) {
|
||||
const mapping = mockState.stagedRelativePaths;
|
||||
params.ctx.MediaPaths = [...mapping];
|
||||
params.ctx.MediaPath = mapping[0];
|
||||
for (let i = 0; i < mapping.length; i += 1) {
|
||||
const source = originalPaths[i];
|
||||
const dest = mapping[i];
|
||||
if (source && dest) {
|
||||
staged.set(source, dest);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mockState.unstagedSources) {
|
||||
for (const source of mockState.unstagedSources) {
|
||||
staged.delete(source);
|
||||
}
|
||||
}
|
||||
return { staged };
|
||||
},
|
||||
),
|
||||
}));
|
||||
|
||||
vi.mock("../../media/store.js", async () => {
|
||||
const original =
|
||||
await vi.importActual<typeof import("../../media/store.js")>("../../media/store.js");
|
||||
return {
|
||||
...original,
|
||||
deleteMediaBuffer: vi.fn(async (id: string, subdir?: string) => {
|
||||
mockState.deleteMediaBufferCalls.push({ id, subdir });
|
||||
}),
|
||||
saveMediaBuffer: vi.fn(async (buffer: Buffer, contentType?: string, subdir?: string) => {
|
||||
mockState.activeSaveMediaCalls += 1;
|
||||
mockState.maxActiveSaveMediaCalls = Math.max(
|
||||
@@ -465,6 +517,11 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
mockState.maxActiveSaveMediaCalls = 0;
|
||||
bindingMocks.resolveByConversation.mockReset();
|
||||
bindingMocks.resolveByConversation.mockReturnValue(null);
|
||||
mockState.sandboxWorkspace = null;
|
||||
mockState.stageSandboxMediaError = null;
|
||||
mockState.stagedRelativePaths = null;
|
||||
mockState.unstagedSources = null;
|
||||
mockState.deleteMediaBufferCalls = [];
|
||||
});
|
||||
|
||||
it("registers tool-event recipients for clients advertising tool-events capability", async () => {
|
||||
@@ -1965,10 +2022,8 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
}
|
||||
| undefined;
|
||||
expect(mockState.lastDispatchImages).toBeUndefined();
|
||||
expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
|
||||
expect(mockState.lastDispatchCtx?.Body).toMatch(
|
||||
/^summarize this\n\[media attached: media:\/\/inbound\//,
|
||||
);
|
||||
expect(mockState.lastDispatchImageOrder).toBeUndefined();
|
||||
expect(mockState.lastDispatchCtx?.Body).toBe("summarize this");
|
||||
expect(mockState.savedMediaCalls).toEqual([
|
||||
expect.objectContaining({
|
||||
contentType: "application/pdf",
|
||||
@@ -1976,7 +2031,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
size: expect.any(Number),
|
||||
}),
|
||||
]);
|
||||
expect(message?.content).toMatch(/^summarize this\n\[media attached: media:\/\/inbound\//);
|
||||
expect(message?.content).toBe("summarize this");
|
||||
expect(message?.MediaPath).toBe("/tmp/chat-send-brief.pdf");
|
||||
expect(message?.MediaPaths).toEqual(["/tmp/chat-send-brief.pdf"]);
|
||||
expect(message?.MediaType).toBe("application/pdf");
|
||||
@@ -2335,6 +2390,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
],
|
||||
},
|
||||
expectBroadcast: false,
|
||||
waitFor: "none",
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchImages).toBeUndefined();
|
||||
@@ -2449,6 +2505,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
],
|
||||
},
|
||||
expectBroadcast: false,
|
||||
waitFor: "none",
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchImages).toBeUndefined();
|
||||
@@ -2461,6 +2518,291 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
]);
|
||||
});
|
||||
|
||||
it("routes non-image offloaded refs into ctx.MediaPaths + MediaTypes for chat.send", async () => {
|
||||
createTranscriptFixture("openclaw-chat-send-non-image-ctx-media-paths-");
|
||||
mockState.finalText = "ok";
|
||||
mockState.sessionEntry = {
|
||||
modelProvider: "test-provider",
|
||||
model: "vision-model",
|
||||
};
|
||||
mockState.modelCatalog = [
|
||||
{
|
||||
provider: "test-provider",
|
||||
id: "vision-model",
|
||||
name: "Vision model",
|
||||
input: ["text", "image"],
|
||||
},
|
||||
];
|
||||
mockState.savedMediaResults = [
|
||||
{ path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
|
||||
];
|
||||
const respond = vi.fn();
|
||||
const context = createChatContext();
|
||||
const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
|
||||
|
||||
await runNonStreamingChatSend({
|
||||
context,
|
||||
respond,
|
||||
idempotencyKey: "idem-non-image-ctx-media",
|
||||
message: "read this",
|
||||
requestParams: {
|
||||
attachments: [
|
||||
{
|
||||
type: "file",
|
||||
mimeType: "application/pdf",
|
||||
fileName: "report.pdf",
|
||||
content: pdf,
|
||||
},
|
||||
],
|
||||
},
|
||||
expectBroadcast: false,
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchCtx?.MediaPaths).toEqual([
|
||||
"/home/user/.openclaw/media/inbound/report.pdf",
|
||||
]);
|
||||
expect(mockState.lastDispatchCtx?.MediaPath).toBe(
|
||||
"/home/user/.openclaw/media/inbound/report.pdf",
|
||||
);
|
||||
expect(mockState.lastDispatchCtx?.MediaTypes).toEqual(["application/pdf"]);
|
||||
expect(mockState.lastDispatchCtx?.MediaType).toBe("application/pdf");
|
||||
// Non-image offloads MUST NOT inject a media://URI into the prompt body —
|
||||
// they ride through ctx.MediaPaths so buildInboundMediaNote prepends the
|
||||
// real path, avoiding duplicate media markers.
|
||||
expect(mockState.lastDispatchCtx?.Body).not.toContain("media://");
|
||||
expect(mockState.lastDispatchCtx?.BodyForAgent).not.toContain("media://");
|
||||
expect(mockState.lastDispatchImages).toBeUndefined();
|
||||
// Marker replaces the implicit "relative-path no-op" coupling in
|
||||
// get-reply.ts with an explicit skip contract.
|
||||
expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
|
||||
});
|
||||
|
||||
it("preserves sandbox-relative MediaPaths and stores workspace context for media-understanding", async () => {
|
||||
createTranscriptFixture("openclaw-chat-send-non-image-absolutize-");
|
||||
mockState.finalText = "ok";
|
||||
mockState.sessionEntry = {
|
||||
modelProvider: "test-provider",
|
||||
model: "vision-model",
|
||||
};
|
||||
mockState.modelCatalog = [
|
||||
{
|
||||
provider: "test-provider",
|
||||
id: "vision-model",
|
||||
name: "Vision model",
|
||||
input: ["text", "image"],
|
||||
},
|
||||
];
|
||||
mockState.savedMediaResults = [
|
||||
{ path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
|
||||
];
|
||||
mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
|
||||
mockState.stagedRelativePaths = ["media/inbound/report.pdf"];
|
||||
const respond = vi.fn();
|
||||
const context = createChatContext();
|
||||
const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
|
||||
|
||||
await runNonStreamingChatSend({
|
||||
context,
|
||||
respond,
|
||||
idempotencyKey: "idem-non-image-absolutize",
|
||||
message: "read this",
|
||||
requestParams: {
|
||||
attachments: [
|
||||
{
|
||||
type: "file",
|
||||
mimeType: "application/pdf",
|
||||
fileName: "report.pdf",
|
||||
content: pdf,
|
||||
},
|
||||
],
|
||||
},
|
||||
expectBroadcast: false,
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["media/inbound/report.pdf"]);
|
||||
expect(mockState.lastDispatchCtx?.MediaPath).toBe("media/inbound/report.pdf");
|
||||
expect(mockState.lastDispatchCtx?.MediaWorkspaceDir).toBe("/sandbox/workspace");
|
||||
expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
|
||||
});
|
||||
|
||||
it("wraps stageSandboxMedia infrastructure errors as 5xx UNAVAILABLE and cleans up media-store files", async () => {
|
||||
createTranscriptFixture("openclaw-chat-send-stage-unavailable-");
|
||||
mockState.finalText = "ok";
|
||||
mockState.sessionEntry = {
|
||||
modelProvider: "test-provider",
|
||||
model: "vision-model",
|
||||
};
|
||||
mockState.modelCatalog = [
|
||||
{
|
||||
provider: "test-provider",
|
||||
id: "vision-model",
|
||||
name: "Vision model",
|
||||
input: ["text", "image"],
|
||||
},
|
||||
];
|
||||
mockState.savedMediaResults = [
|
||||
{ path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
|
||||
];
|
||||
mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
|
||||
mockState.stageSandboxMediaError = Object.assign(new Error("ENOSPC: no space left on device"), {
|
||||
code: "ENOSPC",
|
||||
});
|
||||
const respond = vi.fn();
|
||||
const context = createChatContext();
|
||||
const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
|
||||
|
||||
await runNonStreamingChatSend({
|
||||
context,
|
||||
respond,
|
||||
idempotencyKey: "idem-stage-unavailable",
|
||||
message: "read this",
|
||||
requestParams: {
|
||||
attachments: [
|
||||
{
|
||||
type: "file",
|
||||
mimeType: "application/pdf",
|
||||
fileName: "report.pdf",
|
||||
content: pdf,
|
||||
},
|
||||
],
|
||||
},
|
||||
expectBroadcast: false,
|
||||
waitFor: "none",
|
||||
});
|
||||
|
||||
// Plain Error from stageSandboxMedia would be misclassified as INVALID_REQUEST
|
||||
// by the outer catch. Wrapping it in MediaOffloadError routes it to UNAVAILABLE
|
||||
// so the client retries instead of treating it as a bad request.
|
||||
expect(mockState.lastDispatchCtx).toBeUndefined();
|
||||
expect(respond).toHaveBeenCalledTimes(1);
|
||||
const [ok, payload, error] = respond.mock.calls[0] ?? [];
|
||||
expect(ok).toBe(false);
|
||||
expect(payload).toBeUndefined();
|
||||
expect(error?.code).toBe(ErrorCodes.UNAVAILABLE);
|
||||
expect(error?.message ?? String(error)).toMatch(/ENOSPC|non-image attachments/i);
|
||||
// Orphaned media-store files are cleaned up before the 5xx surfaces.
|
||||
expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]);
|
||||
});
|
||||
|
||||
it("surfaces partial non-image staging failures as 5xx UNAVAILABLE", async () => {
|
||||
// Regression: stageSandboxMedia keeps unstaged entries as their original
|
||||
// absolute path, so a simple `stagedPaths.length === nonImage.length`
|
||||
// check could not detect when one of the files silently fell out (e.g. a
|
||||
// file between the RPC cap and the staging cap). Prestage must compare
|
||||
// the returned `staged` map against the input refs.
|
||||
createTranscriptFixture("openclaw-chat-send-partial-stage-");
|
||||
mockState.finalText = "ok";
|
||||
mockState.sessionEntry = {
|
||||
modelProvider: "test-provider",
|
||||
model: "vision-model",
|
||||
};
|
||||
mockState.modelCatalog = [
|
||||
{
|
||||
provider: "test-provider",
|
||||
id: "vision-model",
|
||||
name: "Vision model",
|
||||
input: ["text", "image"],
|
||||
},
|
||||
];
|
||||
mockState.savedMediaResults = [
|
||||
{ path: "/home/user/.openclaw/media/inbound/report.pdf", contentType: "application/pdf" },
|
||||
{ path: "/home/user/.openclaw/media/inbound/oversize.pdf", contentType: "application/pdf" },
|
||||
];
|
||||
mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
|
||||
mockState.stagedRelativePaths = ["media/inbound/report.pdf", "media/inbound/oversize.pdf"];
|
||||
mockState.unstagedSources = ["/home/user/.openclaw/media/inbound/oversize.pdf"];
|
||||
const respond = vi.fn();
|
||||
const context = createChatContext();
|
||||
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
|
||||
|
||||
await runNonStreamingChatSend({
|
||||
context,
|
||||
respond,
|
||||
idempotencyKey: "idem-partial-stage",
|
||||
message: "read these",
|
||||
requestParams: {
|
||||
attachments: [
|
||||
{ type: "file", mimeType: "application/pdf", fileName: "report.pdf", content: pdf },
|
||||
{ type: "file", mimeType: "application/pdf", fileName: "oversize.pdf", content: pdf },
|
||||
],
|
||||
},
|
||||
expectBroadcast: false,
|
||||
waitFor: "none",
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchCtx).toBeUndefined();
|
||||
expect(respond).toHaveBeenCalledTimes(1);
|
||||
const [ok, payload, error] = respond.mock.calls[0] ?? [];
|
||||
expect(ok).toBe(false);
|
||||
expect(payload).toBeUndefined();
|
||||
expect(error?.code).toBe(ErrorCodes.UNAVAILABLE);
|
||||
expect(error?.message ?? String(error)).toMatch(/staging incomplete/i);
|
||||
// Both media-store entries are cleaned up before the 5xx surfaces.
|
||||
expect(mockState.deleteMediaBufferCalls.map((c) => c.id).toSorted()).toEqual([
|
||||
"saved-media",
|
||||
"saved-media",
|
||||
]);
|
||||
});
|
||||
|
||||
it("rejects sandbox-oversized non-image attachments as 4xx before staging", async () => {
|
||||
// Regression: resolveChatAttachmentMaxBytes defaults to 20MB, but
|
||||
// stageSandboxMedia caps each file at STAGED_MEDIA_MAX_BYTES (5MB) and
|
||||
// silently drops oversize files. Without a pre-check, a sandbox session
|
||||
// accepting a 5-20MB non-image would fail staging and surface as a
|
||||
// retryable 5xx UNAVAILABLE, misleading clients into retrying a
|
||||
// deterministically broken request.
|
||||
createTranscriptFixture("openclaw-chat-send-sandbox-oversize-");
|
||||
mockState.finalText = "ok";
|
||||
mockState.sessionEntry = {
|
||||
modelProvider: "test-provider",
|
||||
model: "vision-model",
|
||||
};
|
||||
mockState.modelCatalog = [
|
||||
{
|
||||
provider: "test-provider",
|
||||
id: "vision-model",
|
||||
name: "Vision model",
|
||||
input: ["text", "image"],
|
||||
},
|
||||
];
|
||||
mockState.savedMediaResults = [
|
||||
{ path: "/home/user/.openclaw/media/inbound/huge.pdf", contentType: "application/pdf" },
|
||||
];
|
||||
mockState.sandboxWorkspace = { workspaceDir: "/sandbox/workspace" };
|
||||
const respond = vi.fn();
|
||||
const context = createChatContext();
|
||||
// 6MB buffer — above STAGED_MEDIA_MAX_BYTES (5MB) but below the 20MB parse cap.
|
||||
const oversized = Buffer.alloc(6 * 1024 * 1024);
|
||||
oversized.set(Buffer.from("%PDF-1.4\n"), 0);
|
||||
const pdf = oversized.toString("base64");
|
||||
|
||||
await runNonStreamingChatSend({
|
||||
context,
|
||||
respond,
|
||||
idempotencyKey: "idem-sandbox-oversize",
|
||||
message: "read this",
|
||||
requestParams: {
|
||||
attachments: [
|
||||
{ type: "file", mimeType: "application/pdf", fileName: "huge.pdf", content: pdf },
|
||||
],
|
||||
},
|
||||
expectBroadcast: false,
|
||||
waitFor: "none",
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchCtx).toBeUndefined();
|
||||
expect(respond).toHaveBeenCalledTimes(1);
|
||||
const [ok, payload, error] = respond.mock.calls[0] ?? [];
|
||||
expect(ok).toBe(false);
|
||||
expect(payload).toBeUndefined();
|
||||
// 4xx, not 5xx — retrying a file that exceeds the staging cap cannot
|
||||
// succeed, so the failure must be surfaced as a client-side rejection.
|
||||
expect(error?.code).toBe(ErrorCodes.INVALID_REQUEST);
|
||||
expect(error?.message ?? String(error)).toMatch(/sandbox staging limit/i);
|
||||
// Orphaned media-store entries are cleaned up before the 4xx surfaces.
|
||||
expect(mockState.deleteMediaBufferCalls).toEqual([{ id: "saved-media", subdir: "inbound" }]);
|
||||
});
|
||||
|
||||
it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {
|
||||
createTranscriptFixture("openclaw-chat-send-image-order-");
|
||||
mockState.finalText = "ok";
|
||||
|
||||
@@ -2,16 +2,20 @@ import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent";
|
||||
import { resolveSendableOutboundReplyParts } from "openclaw/plugin-sdk/reply-payload";
|
||||
import { resolveSessionAgentId } from "../../agents/agent-scope.js";
|
||||
import { resolveAgentWorkspaceDir, resolveSessionAgentId } from "../../agents/agent-scope.js";
|
||||
import { resolveThinkingDefault } from "../../agents/model-selection.js";
|
||||
import { rewriteTranscriptEntriesInSessionFile } from "../../agents/pi-embedded-runner/transcript-rewrite.js";
|
||||
import { ensureSandboxWorkspaceForSession } from "../../agents/sandbox/context.js";
|
||||
import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
|
||||
import { dispatchInboundMessage } from "../../auto-reply/dispatch.js";
|
||||
import type { ReplyPayload } from "../../auto-reply/reply-payload.js";
|
||||
import { createReplyDispatcher } from "../../auto-reply/reply/reply-dispatcher.js";
|
||||
import type { MsgContext } from "../../auto-reply/templating.js";
|
||||
import { stageSandboxMedia } from "../../auto-reply/reply/stage-sandbox-media.js";
|
||||
import type { MsgContext, TemplateContext } from "../../auto-reply/templating.js";
|
||||
import { extractCanvasFromText } from "../../chat/canvas-render.js";
|
||||
import { resolveSessionFilePath } from "../../config/sessions.js";
|
||||
import type { OpenClawConfig } from "../../config/types.openclaw.js";
|
||||
import { formatErrorMessage } from "../../infra/errors.js";
|
||||
import { jsonUtf8Bytes } from "../../infra/json-utf8-bytes.js";
|
||||
import { normalizeReplyPayloadsForDelivery } from "../../infra/outbound/payloads.js";
|
||||
import { getSessionBindingService } from "../../infra/outbound/session-binding-service.js";
|
||||
@@ -22,7 +26,12 @@ import {
|
||||
} from "../../media/local-roots.js";
|
||||
import { isAudioFileName } from "../../media/mime.js";
|
||||
import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js";
|
||||
import { type SavedMedia, saveMediaBuffer } from "../../media/store.js";
|
||||
import {
|
||||
deleteMediaBuffer,
|
||||
MEDIA_MAX_BYTES,
|
||||
type SavedMedia,
|
||||
saveMediaBuffer,
|
||||
} from "../../media/store.js";
|
||||
import { createChannelReplyPipeline } from "../../plugin-sdk/channel-reply-pipeline.js";
|
||||
import { isPluginOwnedSessionBindingRecord } from "../../plugins/conversation-binding.js";
|
||||
import { normalizeInputProvenance, type InputProvenance } from "../../sessions/input-provenance.js";
|
||||
@@ -48,10 +57,12 @@ import {
|
||||
} from "../chat-abort.js";
|
||||
import {
|
||||
type ChatImageContent,
|
||||
MediaOffloadError,
|
||||
type OffloadedRef,
|
||||
parseMessageWithAttachments,
|
||||
resolveChatAttachmentMaxBytes,
|
||||
UnsupportedAttachmentError,
|
||||
} from "../chat-attachments.js";
|
||||
import { MediaOffloadError } from "../chat-attachments.js";
|
||||
import {
|
||||
isToolHistoryBlockType,
|
||||
projectChatDisplayMessage,
|
||||
@@ -712,14 +723,28 @@ async function persistChatSendImages(params: {
|
||||
);
|
||||
}
|
||||
}
|
||||
const offloadedSaved = params.offloadedRefs.map((ref) => ({
|
||||
id: ref.id,
|
||||
path: ref.path,
|
||||
size: 0,
|
||||
contentType: ref.mimeType,
|
||||
}));
|
||||
// imageOrder now only tracks image slots (see chat-attachments.ts), so split
|
||||
// offloaded refs by mime: image offloads interleave with inline images via
|
||||
// imageOrder, and non-image offloads append to the transcript tail. Without
|
||||
// this split a non-image file would consume the next image slot whenever
|
||||
// both kinds appear in the same request.
|
||||
const imageOffloadedSaved: SavedMedia[] = [];
|
||||
const nonImageOffloadedSaved: SavedMedia[] = [];
|
||||
for (const ref of params.offloadedRefs) {
|
||||
const entry: SavedMedia = {
|
||||
id: ref.id,
|
||||
path: ref.path,
|
||||
size: 0,
|
||||
contentType: ref.mimeType,
|
||||
};
|
||||
if (ref.mimeType.startsWith("image/")) {
|
||||
imageOffloadedSaved.push(entry);
|
||||
} else {
|
||||
nonImageOffloadedSaved.push(entry);
|
||||
}
|
||||
}
|
||||
if (params.imageOrder.length === 0) {
|
||||
return [...inlineSaved, ...offloadedSaved];
|
||||
return [...inlineSaved, ...imageOffloadedSaved, ...nonImageOffloadedSaved];
|
||||
}
|
||||
const saved: SavedMedia[] = [];
|
||||
let inlineIndex = 0;
|
||||
@@ -732,7 +757,7 @@ async function persistChatSendImages(params: {
|
||||
}
|
||||
continue;
|
||||
}
|
||||
const offloaded = offloadedSaved[offloadedIndex++];
|
||||
const offloaded = imageOffloadedSaved[offloadedIndex++];
|
||||
if (offloaded) {
|
||||
saved.push(offloaded);
|
||||
}
|
||||
@@ -743,12 +768,15 @@ async function persistChatSendImages(params: {
|
||||
saved.push(inline);
|
||||
}
|
||||
}
|
||||
for (; offloadedIndex < offloadedSaved.length; offloadedIndex++) {
|
||||
const offloaded = offloadedSaved[offloadedIndex];
|
||||
for (; offloadedIndex < imageOffloadedSaved.length; offloadedIndex++) {
|
||||
const offloaded = imageOffloadedSaved[offloadedIndex];
|
||||
if (offloaded) {
|
||||
saved.push(offloaded);
|
||||
}
|
||||
}
|
||||
for (const offloaded of nonImageOffloadedSaved) {
|
||||
saved.push(offloaded);
|
||||
}
|
||||
return saved;
|
||||
}
|
||||
|
||||
@@ -766,6 +794,115 @@ function buildChatSendTranscriptMessage(params: {
|
||||
};
|
||||
}
|
||||
|
||||
// Stages non-image offloads into the agent sandbox synchronously so chat.send
|
||||
// can surface 5xx before respond(). Throws MediaOffloadError on any staging
|
||||
// failure (ENOSPC / EPERM / partial-stage) so the outer chat.send handler can
|
||||
// map it to UNAVAILABLE (5xx); plain Error would be misclassified as 4xx. All
|
||||
// offloaded refs are cleaned up from the media store before rethrow.
|
||||
// Callers MUST set ctx.MediaStaged=true when this runs so the dispatch
|
||||
// pipeline skips its own stageSandboxMedia pass.
|
||||
//
|
||||
// Returned paths are ABSOLUTE (pointing into the sandbox workspace when sandbox
|
||||
// is enabled, or the media-store origin when it is not). applyMediaUnderstanding
|
||||
// runs before any further staging in get-reply.ts and uses
|
||||
// `path.isAbsolute(raw) ? raw : path.resolve(raw)` against the gateway CWD, so
|
||||
// any relative path here would make media-understanding target the wrong host
|
||||
// path and silently skip file analysis.
|
||||
async function prestageNonImageOffloads(params: {
|
||||
offloadedRefs: OffloadedRef[];
|
||||
cfg: OpenClawConfig;
|
||||
sessionKey: string;
|
||||
agentId: string;
|
||||
}): Promise<{ paths: string[]; types: string[]; workspaceDir?: string }> {
|
||||
const nonImage = params.offloadedRefs.filter((ref) => !ref.mimeType.startsWith("image/"));
|
||||
if (nonImage.length === 0) {
|
||||
return { paths: [], types: [] };
|
||||
}
|
||||
|
||||
try {
|
||||
const workspaceDir = resolveAgentWorkspaceDir(params.cfg, params.agentId);
|
||||
const sandbox = await ensureSandboxWorkspaceForSession({
|
||||
config: params.cfg,
|
||||
sessionKey: params.sessionKey,
|
||||
workspaceDir,
|
||||
});
|
||||
if (!sandbox) {
|
||||
return {
|
||||
paths: nonImage.map((ref) => ref.path),
|
||||
types: nonImage.map((ref) => ref.mimeType),
|
||||
};
|
||||
}
|
||||
|
||||
// stageSandboxMedia caps each file at STAGED_MEDIA_MAX_BYTES (=
|
||||
// MEDIA_MAX_BYTES, 5MB) and silently skips oversized files. The parse cap
|
||||
// (resolveChatAttachmentMaxBytes, default 20MB) is higher, so a sandboxed
|
||||
// session receiving a non-image file between the two caps would otherwise
|
||||
// pass parse, fail staging, and surface as a retryable 5xx even though
|
||||
// retry cannot succeed. Reject here as a client-side 4xx instead.
|
||||
const oversizedForSandbox = nonImage.filter((ref) => ref.sizeBytes > MEDIA_MAX_BYTES);
|
||||
if (oversizedForSandbox.length > 0) {
|
||||
const details = oversizedForSandbox
|
||||
.map((ref) => `${ref.label} (${ref.sizeBytes} bytes)`)
|
||||
.join(", ");
|
||||
throw new UnsupportedAttachmentError(
|
||||
"non-image-too-large-for-sandbox",
|
||||
`non-image attachments exceed sandbox staging limit (${MEDIA_MAX_BYTES} bytes): ${details}`,
|
||||
);
|
||||
}
|
||||
|
||||
const stagingCtx: MsgContext = {
|
||||
MediaPath: nonImage[0].path,
|
||||
MediaPaths: nonImage.map((ref) => ref.path),
|
||||
MediaType: nonImage[0].mimeType,
|
||||
MediaTypes: nonImage.map((ref) => ref.mimeType),
|
||||
};
|
||||
const stageResult = await stageSandboxMedia({
|
||||
ctx: stagingCtx,
|
||||
sessionCtx: stagingCtx as TemplateContext,
|
||||
cfg: params.cfg,
|
||||
sessionKey: params.sessionKey,
|
||||
workspaceDir,
|
||||
});
|
||||
|
||||
// stageSandboxMedia silently keeps unstaged entries as their original
|
||||
// absolute path, so length parity with `nonImage` does not prove every
|
||||
// file landed in the sandbox. The RPC max (20MB via
|
||||
// resolveChatAttachmentMaxBytes) admits files above the staging cap
|
||||
// (STAGED_MEDIA_MAX_BYTES = 5MB); check the returned `staged` map so any
|
||||
// missing source becomes a 5xx MediaOffloadError the client can retry.
|
||||
const stagedSources = stageResult.staged;
|
||||
const missing = nonImage.filter((ref) => !stagedSources.has(ref.path));
|
||||
if (missing.length > 0) {
|
||||
throw new Error(
|
||||
`non-image attachment staging incomplete: ${stagedSources.size}/${nonImage.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`,
|
||||
);
|
||||
}
|
||||
const stagedPaths = stagingCtx.MediaPaths ?? [];
|
||||
const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);
|
||||
|
||||
// Keep stagedPaths sandbox-relative (e.g. `media/inbound/foo.pdf`) so the
|
||||
// agent inside the container can read them. Host-side media-understanding
|
||||
// resolves them via ctx.MediaWorkspaceDir, which we carry separately.
|
||||
return { paths: stagedPaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir };
|
||||
} catch (err) {
|
||||
await Promise.allSettled(
|
||||
params.offloadedRefs.map((ref) => deleteMediaBuffer(ref.id, "inbound")),
|
||||
);
|
||||
if (err instanceof MediaOffloadError) {
|
||||
throw err;
|
||||
}
|
||||
// Sandbox-oversize rejections are client-side 4xx (see check above). Wrapping
|
||||
// them as MediaOffloadError would misclassify them as retryable 5xx.
|
||||
if (err instanceof UnsupportedAttachmentError) {
|
||||
throw err;
|
||||
}
|
||||
throw new MediaOffloadError(
|
||||
`[Gateway Error] Failed to stage non-image attachments into agent workspace: ${formatErrorMessage(err)}`,
|
||||
{ cause: err },
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
function resolveChatSendTranscriptMediaFields(savedImages: SavedMedia[]) {
|
||||
const mediaPaths = savedImages.map((entry) => entry.path);
|
||||
if (mediaPaths.length === 0) {
|
||||
@@ -1759,6 +1896,9 @@ export const chatHandlers: GatewayRequestHandlers = {
|
||||
let parsedImages: ChatImageContent[] = [];
|
||||
let imageOrder: PromptImageOrderEntry[] = [];
|
||||
let offloadedRefs: OffloadedRef[] = [];
|
||||
let nonImageMediaPaths: string[] = [];
|
||||
let nonImageMediaTypes: string[] = [];
|
||||
let nonImageMediaWorkspaceDir: string | undefined;
|
||||
const timeoutMs = resolveAgentTimeoutMs({
|
||||
cfg,
|
||||
overrideMs: p.timeoutMs,
|
||||
@@ -1833,14 +1973,27 @@ export const chatHandlers: GatewayRequestHandlers = {
|
||||
explicitOriginTargetsPlugin;
|
||||
try {
|
||||
const parsed = await parseMessageWithAttachments(inboundMessage, normalizedAttachments, {
|
||||
maxBytes: 5_000_000,
|
||||
maxBytes: resolveChatAttachmentMaxBytes(cfg),
|
||||
log: context.logGateway,
|
||||
supportsImages,
|
||||
// chat.send routes non-image offloadedRefs into ctx.MediaPaths below
|
||||
// so the auto-reply stage pipeline can surface them to the agent.
|
||||
acceptNonImage: true,
|
||||
});
|
||||
parsedMessage = parsed.message;
|
||||
parsedImages = parsed.images;
|
||||
imageOrder = parsed.imageOrder;
|
||||
offloadedRefs = parsed.offloadedRefs;
|
||||
({
|
||||
paths: nonImageMediaPaths,
|
||||
types: nonImageMediaTypes,
|
||||
workspaceDir: nonImageMediaWorkspaceDir,
|
||||
} = await prestageNonImageOffloads({
|
||||
offloadedRefs,
|
||||
cfg,
|
||||
sessionKey,
|
||||
agentId,
|
||||
}));
|
||||
} catch (err) {
|
||||
respond(
|
||||
false,
|
||||
@@ -1947,6 +2100,19 @@ export const chatHandlers: GatewayRequestHandlers = {
|
||||
GatewayClientScopes: client?.connect?.scopes ?? [],
|
||||
...pluginBoundMediaFields,
|
||||
};
|
||||
if (nonImageMediaPaths.length > 0) {
|
||||
// Inject non-image offloads via the same MsgContext fields the channel
|
||||
// path uses so buildInboundMediaNote renders a real `[media attached:
|
||||
// <workspace-relative-path>]` line into the agent prompt. Marker
|
||||
// blocks the dispatch pipeline from re-running stageSandboxMedia; see
|
||||
// prestageNonImageOffloads.
|
||||
ctx.MediaPath = nonImageMediaPaths[0];
|
||||
ctx.MediaPaths = nonImageMediaPaths;
|
||||
ctx.MediaType = nonImageMediaTypes[0];
|
||||
ctx.MediaTypes = nonImageMediaTypes;
|
||||
ctx.MediaWorkspaceDir = nonImageMediaWorkspaceDir;
|
||||
ctx.MediaStaged = true;
|
||||
}
|
||||
|
||||
const { onModelSelected, ...replyPipeline } = createChannelReplyPipeline({
|
||||
cfg,
|
||||
|
||||
@@ -15,7 +15,7 @@ export { enqueueSystemEvent } from "../infra/system-events.js";
|
||||
export { deleteMediaBuffer } from "../media/store.js";
|
||||
export { normalizeMainKey, scopedHeartbeatWakeOptions } from "../routing/session-key.js";
|
||||
export { defaultRuntime } from "../runtime.js";
|
||||
export { parseMessageWithAttachments } from "./chat-attachments.js";
|
||||
export { parseMessageWithAttachments, resolveChatAttachmentMaxBytes } from "./chat-attachments.js";
|
||||
export { normalizeRpcAttachmentsToChatAttachments } from "./server-methods/attachment-normalize.js";
|
||||
export {
|
||||
loadSessionEntry,
|
||||
|
||||
@@ -90,6 +90,7 @@ const runtimeMocks = vi.hoisted(() => ({
|
||||
parseMessageWithAttachments: parseMessageWithAttachmentsMock,
|
||||
registerApnsRegistration: registerApnsRegistrationMock,
|
||||
requestHeartbeatNow: vi.fn(),
|
||||
resolveChatAttachmentMaxBytes: vi.fn(() => 20 * 1024 * 1024),
|
||||
resolveGatewayModelSupportsImages: vi.fn(
|
||||
async ({
|
||||
loadGatewayModelCatalog,
|
||||
@@ -923,7 +924,7 @@ describe("agent request events", () => {
|
||||
expect(opts.runId).toBe(opts.sessionId);
|
||||
});
|
||||
|
||||
it("passes supportsImages false for text-only node-session models", async () => {
|
||||
it("passes supportsInlineImages false for text-only node-session models", async () => {
|
||||
const ctx = buildCtx();
|
||||
ctx.loadGatewayModelCatalog = async () => [
|
||||
{
|
||||
@@ -960,7 +961,41 @@ describe("agent request events", () => {
|
||||
expect(parseMessageWithAttachmentsMock).toHaveBeenCalledWith(
|
||||
"describe",
|
||||
expect.any(Array),
|
||||
expect.objectContaining({ supportsImages: false }),
|
||||
expect.objectContaining({ supportsInlineImages: false }),
|
||||
);
|
||||
});
|
||||
|
||||
it("declines non-image attachments cleanly when parse throws UnsupportedAttachmentError", async () => {
|
||||
const warn = vi.fn();
|
||||
const ctx = buildCtx();
|
||||
ctx.logGateway = { warn };
|
||||
|
||||
parseMessageWithAttachmentsMock.mockRejectedValueOnce(
|
||||
Object.assign(new Error("attachment a.pdf: non-image attachments not supported"), {
|
||||
name: "UnsupportedAttachmentError",
|
||||
reason: "unsupported-non-image",
|
||||
}),
|
||||
);
|
||||
|
||||
await handleNodeEvent(ctx, "node-non-image-refusal", {
|
||||
event: "agent.request",
|
||||
payloadJSON: JSON.stringify({
|
||||
message: "read this",
|
||||
sessionKey: "agent:main:main",
|
||||
attachments: [
|
||||
{
|
||||
type: "file",
|
||||
mimeType: "application/pdf",
|
||||
fileName: "a.pdf",
|
||||
content: "JVBERi0=",
|
||||
},
|
||||
],
|
||||
}),
|
||||
});
|
||||
|
||||
// server-node-events must log-and-return on parse failure — no agent
|
||||
// dispatch, no crash, and the refusal reason bubbles up via logGateway.
|
||||
expect(agentCommandMock).not.toHaveBeenCalled();
|
||||
expect(warn).toHaveBeenCalledWith(expect.stringMatching(/attachment parse failed.*non-image/i));
|
||||
});
|
||||
});
|
||||
|
||||
@@ -26,6 +26,7 @@ import {
|
||||
parseMessageWithAttachments,
|
||||
registerApnsRegistration,
|
||||
requestHeartbeatNow,
|
||||
resolveChatAttachmentMaxBytes,
|
||||
resolveGatewayModelSupportsImages,
|
||||
resolveOutboundTarget,
|
||||
resolveSessionAgentId,
|
||||
@@ -427,16 +428,20 @@ export const handleNodeEvent = async (ctx: NodeEventContext, nodeId: string, evt
|
||||
if (normalizedAttachments.length > 0) {
|
||||
const sessionAgentId = resolveSessionAgentId({ sessionKey, config: cfg });
|
||||
const modelRef = resolveSessionModelRef(cfg, entry, sessionAgentId);
|
||||
const supportsImages = await resolveGatewayModelSupportsImages({
|
||||
const supportsInlineImages = await resolveGatewayModelSupportsImages({
|
||||
loadGatewayModelCatalog: ctx.loadGatewayModelCatalog,
|
||||
provider: modelRef.provider,
|
||||
model: modelRef.model,
|
||||
});
|
||||
try {
|
||||
const parsed = await parseMessageWithAttachments(message, normalizedAttachments, {
|
||||
maxBytes: 5_000_000,
|
||||
maxBytes: resolveChatAttachmentMaxBytes(cfg),
|
||||
log: ctx.logGateway,
|
||||
supportsImages,
|
||||
supportsInlineImages,
|
||||
// server-node-events dispatches via agentCommandFromIngress which
|
||||
// has no ctx.MediaPaths wiring; reject non-image attachments
|
||||
// explicitly rather than saving them where the agent cannot reach them.
|
||||
acceptNonImage: false,
|
||||
});
|
||||
message = parsed.message.trim();
|
||||
images = parsed.images;
|
||||
|
||||
@@ -536,6 +536,7 @@ export async function applyMediaUnderstanding(params: {
|
||||
const cache = createMediaAttachmentCache(attachments, {
|
||||
localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
|
||||
ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
|
||||
workspaceDir: ctx.MediaWorkspaceDir,
|
||||
});
|
||||
|
||||
try {
|
||||
|
||||
@@ -53,6 +53,7 @@ export type MediaAttachmentCacheOptions = {
|
||||
localPathRoots?: readonly string[];
|
||||
includeDefaultLocalPathRoots?: boolean;
|
||||
ssrfPolicy?: SsrFPolicy;
|
||||
workspaceDir?: string;
|
||||
};
|
||||
|
||||
function resolveRequestUrl(input: RequestInfo | URL): string {
|
||||
@@ -70,6 +71,7 @@ export class MediaAttachmentCache {
|
||||
private readonly attachments: MediaAttachment[];
|
||||
private readonly localPathRoots: readonly string[];
|
||||
private readonly ssrfPolicy: SsrFPolicy | undefined;
|
||||
private readonly workspaceDir?: string;
|
||||
private canonicalLocalPathRoots?: Promise<readonly string[]>;
|
||||
|
||||
constructor(attachments: MediaAttachment[], options?: MediaAttachmentCacheOptions) {
|
||||
@@ -79,6 +81,7 @@ export class MediaAttachmentCache {
|
||||
options?.includeDefaultLocalPathRoots === false
|
||||
? mergeInboundPathRoots(options.localPathRoots)
|
||||
: mergeInboundPathRoots(options?.localPathRoots, getDefaultLocalPathRoots());
|
||||
this.workspaceDir = options?.workspaceDir ? path.resolve(options.workspaceDir) : undefined;
|
||||
for (const attachment of attachments) {
|
||||
this.entries.set(attachment.index, { attachment });
|
||||
}
|
||||
@@ -293,7 +296,7 @@ export class MediaAttachmentCache {
|
||||
if (!rawPath) {
|
||||
return undefined;
|
||||
}
|
||||
return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
|
||||
return this.workspaceDir ? path.resolve(this.workspaceDir, rawPath) : path.resolve(rawPath);
|
||||
}
|
||||
|
||||
private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
|
||||
|
||||
@@ -97,6 +97,23 @@ describe("media understanding attachments SSRF", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("resolves relative attachment paths against the provided workspaceDir", async () => {
|
||||
await withTempDir({ prefix: "openclaw-media-cache-workspace-" }, async (base) => {
|
||||
const workspaceDir = path.join(base, "workspace");
|
||||
const attachmentPath = path.join(workspaceDir, "media", "inbound", "report.pdf");
|
||||
await fs.mkdir(path.dirname(attachmentPath), { recursive: true });
|
||||
await fs.writeFile(attachmentPath, "ok");
|
||||
|
||||
const cache = new MediaAttachmentCache([{ index: 0, path: "media/inbound/report.pdf" }], {
|
||||
localPathRoots: [workspaceDir],
|
||||
workspaceDir,
|
||||
});
|
||||
|
||||
const result = await cache.getBuffer({ attachmentIndex: 0, maxBytes: 1024, timeoutMs: 1000 });
|
||||
expect(result.buffer.toString()).toBe("ok");
|
||||
});
|
||||
});
|
||||
|
||||
it("blocks local attachments outside configured roots", async () => {
|
||||
if (process.platform === "win32") {
|
||||
return;
|
||||
|
||||
@@ -270,8 +270,15 @@ export function resolveMediaAttachmentLocalRoots(params: {
|
||||
cfg: OpenClawConfig;
|
||||
ctx: MsgContext;
|
||||
}): readonly string[] {
|
||||
// ctx.MediaWorkspaceDir is set by chat.send's prestageNonImageOffloads when
|
||||
// inbound attachments were staged into a sandbox workspace. The paths in
|
||||
// ctx.MediaPaths are kept sandbox-relative (so the agent inside the
|
||||
// container can read them), and the workspace dir is carried separately so
|
||||
// host-side media-understanding can still resolve them via this root list.
|
||||
const workspaceDir = params.ctx.MediaWorkspaceDir;
|
||||
return mergeInboundPathRoots(
|
||||
getDefaultMediaLocalRoots(),
|
||||
workspaceDir ? [path.resolve(workspaceDir)] : undefined,
|
||||
resolveChannelInboundAttachmentRoots(params),
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user