feat(gateway): offload non-image attachments on chat.send

Signed-off-by: samzong <samzong.lu@gmail.com>
This commit is contained in:
samzong
2026-04-16 15:39:45 +08:00
committed by Frank Yang
parent a68cc94c36
commit 4c61d307da
9 changed files with 561 additions and 353 deletions

View File

@@ -613,7 +613,11 @@ export async function getReplyFromConfig(
}
}
if (!useFastTestBootstrap && sessionKey && hasInboundMedia(ctx)) {
// ctx.MediaStaged=true means the caller (e.g. chat.send RPC) already staged
// synchronously so it could surface 5xx before respond(). Skipping here keeps
// staging a single-call contract instead of relying on relative-path no-op
// semantics in stageSandboxMedia.
if (!useFastTestBootstrap && sessionKey && !ctx.MediaStaged && hasInboundMedia(ctx)) {
const { stageSandboxMedia } = await loadStageSandboxMediaRuntime();
await stageSandboxMedia({
ctx,

View File

@@ -123,8 +123,15 @@ export type MsgContext = {
MediaPaths?: string[];
MediaUrls?: string[];
MediaTypes?: string[];
MediaWorkspaceDir?: string;
/** Attachment indexes whose audio was already transcribed before media understanding runs. */
MediaTranscribedIndexes?: number[];
/**
* Marker: skip downstream stageSandboxMedia. chat.send RPC sets this so
* staging runs synchronously before respond() and surfaces 5xx to the
* client; any later failure only reaches the broadcast channel.
*/
MediaStaged?: boolean;
/** Telegram sticker metadata (emoji, set name, file IDs, cached description). */
Sticker?: StickerContextMetadata;
/** True when current-turn sticker media is present in MediaPaths (false for cached-description path). */

View File

@@ -1,26 +1,59 @@
import { describe, expect, it, vi } from "vitest";
import { deleteMediaBuffer } from "../media/store.js";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
// Test double for saveMediaBuffer. It is created inside vi.hoisted() because
// Vitest hoists vi.mock() calls, so the mock instance must already exist when
// the factory below references it.
// The double performs no I/O: it ignores the buffer and sub-directory and just
// returns a record describing a file that was never written.
const saveMediaBufferMock = vi.hoisted(() =>
vi.fn(async (_buffer: Buffer, mime?: string, _subdir?: string) => ({
// Random suffix: ids differ between calls and between runs.
id: `fake-id-${Math.random().toString(36).slice(2, 10)}`,
// Extension is the MIME subtype (text after "/"), or "bin" when no MIME is given.
path: `/tmp/openclaw-test-media/inbound/fake.${mime?.split("/")[1] ?? "bin"}`,
size: 0,
// The MIME passed in is echoed back unchanged (may be undefined).
contentType: mime,
})),
);
// Test double for deleteMediaBuffer: resolves to undefined without doing anything.
const deleteMediaBufferMock = vi.hoisted(() => vi.fn(async () => undefined));
// Partial mock of the media store module: every real export is kept except
// saveMediaBuffer and deleteMediaBuffer, which are swapped for the doubles above.
vi.mock("../media/store.js", async (importOriginal) => {
const actual = (await importOriginal()) as Record<string, unknown>;
return {
...actual,
saveMediaBuffer: saveMediaBufferMock,
deleteMediaBuffer: deleteMediaBufferMock,
};
});
import {
buildMessageWithAttachments,
type ChatAttachment,
parseMessageWithAttachments,
UnsupportedAttachmentError,
} from "./chat-attachments.js";
const PNG_1x1 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
async function parseWithWarnings(message: string, attachments: ChatAttachment[]) {
/**
 * Test helper: runs parseMessageWithAttachments and collects every warning it
 * logs.
 *
 * @param message - Message text handed to the parser.
 * @param attachments - Attachments handed to the parser.
 * @param opts - Extra parser options. They are spread after the capturing
 *   logger, so a caller-supplied `log` replaces it.
 * @returns The parser result together with the captured warning strings.
 */
async function parseWithWarnings(
  message: string,
  attachments: ChatAttachment[],
  opts: Parameters<typeof parseMessageWithAttachments>[2] = {},
) {
  const logs: string[] = [];
  const capturingLog = { warn: (warning: string) => logs.push(warning) };
  const parsed = await parseMessageWithAttachments(message, attachments, {
    log: capturingLog,
    ...opts,
  });
  return { parsed, logs };
}
async function cleanupOffloadedRefs(refs: { id: string }[]) {
await Promise.allSettled(refs.map((ref) => deleteMediaBuffer(ref.id, "inbound")));
await Promise.allSettled(refs.map((ref) => deleteMediaBufferMock(ref.id, "inbound")));
}
// Wipe the recorded calls on both media-store doubles before each test so
// call-count assertions only see the current test's activity.
beforeEach(() => {
saveMediaBufferMock.mockClear();
deleteMediaBufferMock.mockClear();
});
// NOTE(review): presumably here to undo any spies a test installs on its own — confirm
// against the Vitest version in use, since restoreAllMocks semantics changed across majors.
afterEach(() => {
vi.restoreAllMocks();
});
describe("buildMessageWithAttachments", () => {
it("embeds a single image as data URL", () => {
const msg = buildMessageWithAttachments("see this", [
@@ -81,19 +114,43 @@ describe("parseMessageWithAttachments", () => {
expect(logs).toHaveLength(0);
});
it("drops non-image payloads and logs", async () => {
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
const { parsed, logs } = await parseWithWarnings("x", [
it("accepts non-image payloads and offloads them via the media store", async () => {
const pdf = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
const { parsed, logs } = await parseWithWarnings("read this", [
{
type: "file",
mimeType: "image/png",
fileName: "not-image.pdf",
mimeType: "application/pdf",
fileName: "report.pdf",
content: pdf,
},
]);
expect(parsed.images).toHaveLength(0);
expect(logs).toHaveLength(1);
expect(logs[0]).toMatch(/non-image/i);
expect(parsed.offloadedRefs).toHaveLength(1);
const ref = parsed.offloadedRefs[0];
expect(ref.mimeType).toBe("application/pdf");
expect(ref.label).toBe("report.pdf");
expect(ref.mediaRef).toMatch(/^media:\/\/inbound\//);
// Non-image offloads MUST NOT inject a media://URI into the message —
// the caller is responsible for routing offloadedRefs[].path into
// ctx.MediaPaths so the workspace stage surfaces a real path.
expect(parsed.message).toBe("read this");
expect(saveMediaBufferMock).toHaveBeenCalledOnce();
expect(saveMediaBufferMock.mock.calls[0]?.[1]).toBe("application/pdf");
expect(logs).toHaveLength(0);
});
it("offloads opaque binary when sniff and provided mime are both absent", async () => {
const unknown = Buffer.from("just some bytes that do not match any signature").toString(
"base64",
);
const { parsed, logs } = await parseWithWarnings("take a look", [
{ type: "file", fileName: "blob.dat", content: unknown },
]);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/octet-stream");
expect(saveMediaBufferMock.mock.calls[0]?.[1]).toBe("application/octet-stream");
expect(parsed.message).toBe("take a look");
expect(logs).toHaveLength(0);
});
it("prefers sniffed mime type and logs mismatch", async () => {
@@ -111,28 +168,9 @@ describe("parseMessageWithAttachments", () => {
expect(logs[0]).toMatch(/mime mismatch/i);
});
it("persists unknown non-image files when sniff fails", async () => {
const unknown = Buffer.from("not an image").toString("base64");
const { parsed, logs } = await parseWithWarnings("x", [
{ type: "file", fileName: "unknown.bin", content: unknown },
]);
try {
expect(parsed.images).toHaveLength(0);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]).toMatchObject({
label: "unknown.bin",
mimeType: "application/octet-stream",
});
expect(parsed.message).toMatch(/^x\n\[media attached: media:\/\/inbound\//);
expect(logs).toHaveLength(0);
} finally {
await cleanupOffloadedRefs(parsed.offloadedRefs);
}
});
it("keeps valid images and drops invalid ones", async () => {
it("keeps image inline and offloads non-image side by side", async () => {
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
const { parsed, logs } = await parseWithWarnings("x", [
const { parsed } = await parseWithWarnings("x", [
{
type: "image",
mimeType: "image/png",
@@ -141,15 +179,130 @@ describe("parseMessageWithAttachments", () => {
},
{
type: "file",
mimeType: "image/png",
fileName: "not-image.pdf",
mimeType: "application/pdf",
fileName: "report.pdf",
content: pdf,
},
]);
expect(parsed.images).toHaveLength(1);
expect(parsed.images[0]?.mimeType).toBe("image/png");
expect(parsed.images[0]?.data).toBe(PNG_1x1);
expect(logs.some((l) => /non-image/i.test(l))).toBe(true);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf");
expect(parsed.imageOrder).toEqual(["inline", "offloaded"]);
});
it("preserves specific OOXML mime when sniff returns generic zip (docx)", async () => {
const docx = Buffer.from("PK\u0003\u0004fake-docx-content").toString("base64");
const { parsed } = await parseWithWarnings("x", [
{
type: "file",
mimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
fileName: "spec.docx",
content: docx,
},
]);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.label).toBe("spec.docx");
// Docx sniffs as application/zip; the provided OOXML mime must win so the
// agent sees the real document type, not a generic archive.
expect(parsed.offloadedRefs[0]?.mimeType).toBe(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
);
});
it("recovers specific mime from filename extension when sniff is generic and provided mime is absent", async () => {
const xlsx = Buffer.from("PK\u0003\u0004fake-xlsx").toString("base64");
const { parsed } = await parseWithWarnings("x", [
{ type: "file", fileName: "sheet.xlsx", content: xlsx },
]);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.mimeType).toBe(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
);
});
it("accepts zip attachments via workspace offload", async () => {
const zip = Buffer.from("PK\u0003\u0004zip-archive-bytes").toString("base64");
const { parsed } = await parseWithWarnings("x", [
{
type: "file",
mimeType: "application/zip",
fileName: "bundle.zip",
content: zip,
},
]);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.label).toBe("bundle.zip");
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/zip");
});
});
describe("parseMessageWithAttachments validation errors", () => {
it("throws UnsupportedAttachmentError on empty payload", async () => {
await expect(
parseMessageWithAttachments(
"x",
[{ type: "file", mimeType: "application/pdf", fileName: "empty.pdf", content: "" }],
{ log: { warn: () => {} } },
),
).rejects.toMatchObject({
name: "UnsupportedAttachmentError",
reason: "empty-payload",
});
expect(saveMediaBufferMock).not.toHaveBeenCalled();
});
it("throws UnsupportedAttachmentError on non-image when acceptNonImage is false", async () => {
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
let caught: unknown;
try {
await parseMessageWithAttachments(
"x",
[{ type: "file", mimeType: "application/pdf", fileName: "a.pdf", content: pdf }],
{ log: { warn: () => {} }, acceptNonImage: false },
);
} catch (err) {
caught = err;
}
expect(caught).toBeInstanceOf(UnsupportedAttachmentError);
expect((caught as UnsupportedAttachmentError).reason).toBe("unsupported-non-image");
expect(saveMediaBufferMock).not.toHaveBeenCalled();
});
it("throws UnsupportedAttachmentError on image when supportsInlineImages is false", async () => {
let caught: unknown;
try {
await parseMessageWithAttachments(
"x",
[{ type: "image", mimeType: "image/png", fileName: "dot.png", content: PNG_1x1 }],
{ log: { warn: () => {} }, supportsInlineImages: false },
);
} catch (err) {
caught = err;
}
expect(caught).toBeInstanceOf(UnsupportedAttachmentError);
expect((caught as UnsupportedAttachmentError).reason).toBe("text-only-image");
expect(saveMediaBufferMock).not.toHaveBeenCalled();
});
it("still offloads non-image attachments when supportsInlineImages is false", async () => {
const pdf = Buffer.from("%PDF-1.4\n").toString("base64");
const { parsed } = await parseWithWarnings(
"x",
[{ type: "file", mimeType: "application/pdf", fileName: "a.pdf", content: pdf }],
{ supportsInlineImages: false },
);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]?.mimeType).toBe("application/pdf");
expect(saveMediaBufferMock).toHaveBeenCalledOnce();
});
it("passes through unchanged on text-only session with no attachments", async () => {
const { parsed } = await parseWithWarnings("hello", [], { supportsInlineImages: false });
expect(parsed.message).toBe("hello");
expect(parsed.images).toHaveLength(0);
expect(parsed.offloadedRefs).toHaveLength(0);
expect(saveMediaBufferMock).not.toHaveBeenCalled();
});
it("persists non-image file attachments as media refs", async () => {
@@ -168,13 +321,13 @@ describe("parseMessageWithAttachments", () => {
try {
expect(parsed.images).toHaveLength(0);
expect(parsed.imageOrder).toEqual(["offloaded"]);
expect(parsed.imageOrder).toEqual([]);
expect(parsed.offloadedRefs).toHaveLength(1);
expect(parsed.offloadedRefs[0]).toMatchObject({
mimeType: "application/pdf",
label: "brief.pdf",
});
expect(parsed.message).toMatch(/^read this\n\[media attached: media:\/\/inbound\//);
expect(parsed.message).toBe("read this");
} finally {
await cleanupOffloadedRefs(parsed.offloadedRefs);
}

View File

@@ -1,12 +1,12 @@
import type { OpenClawConfig } from "../config/types.openclaw.js";
import { formatErrorMessage } from "../infra/errors.js";
import { estimateBase64DecodedBytes } from "../media/base64.js";
import { MAX_IMAGE_BYTES } from "../media/constants.js";
import { extensionForMime, mimeTypeFromFilePath } from "../media/mime.js";
import type { PromptImageOrderEntry } from "../media/prompt-image-order.js";
import { sniffMimeFromBase64 } from "../media/sniff-mime-from-base64.js";
import { deleteMediaBuffer, saveMediaBuffer } from "../media/store.js";
import {
normalizeLowercaseStringOrEmpty,
normalizeOptionalLowercaseString,
} from "../shared/string-coerce.js";
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
export type ChatAttachment = {
type?: string;
@@ -21,47 +21,19 @@ export type ChatImageContent = {
mimeType: string;
};
/**
* Metadata for an attachment that was offloaded to the media store.
*
* Included in ParsedMessageWithImages.offloadedRefs so that callers can
* persist structured media metadata for transcripts. Without this, consumers
* that derive MediaPath/MediaPaths from the `images` array (e.g.
* persistChatSendImages and buildChatSendTranscriptMessage in chat.ts) would
* silently omit every attachment that was offloaded to disk.
*/
export type OffloadedRef = {
/**
* Opaque media URI of the form "media://inbound/<id>".
* NOTE(review): the current offload path appends a `[media attached: …]`
* marker carrying this URI to the message only for image attachments — confirm.
*/
mediaRef: string;
/** The raw media ID from SavedMedia.id, usable with resolveMediaBufferPath */
id: string;
/** Filesystem path returned by saveMediaBuffer — used for transcript MediaPath */
path: string;
/** MIME type of the offloaded attachment */
mimeType: string;
/** The label / filename of the original attachment */
label: string;
/** Attachment size in bytes, as estimated from the base64 payload before decoding */
sizeBytes: number;
};
/** Result of parseMessageWithAttachments. */
export type ParsedMessageWithImages = {
/**
* Message text. Starts as the caller's message; a
* `[media attached: media://inbound/<id>]` line is appended for each
* offloaded attachment that is referenced in-text.
*/
message: string;
/** Image attachments that were not offloaded and are passed inline to the model */
images: ChatImageContent[];
/** Original accepted attachment order after inline/offloaded split. */
imageOrder: PromptImageOrderEntry[];
/**
* Attachments that were saved to the media store instead of being inlined.
* The offload decision is: image offload forced for a text-only run, OR the
* attachment is not an image, OR its size exceeds OFFLOAD_THRESHOLD_BYTES.
*
* Callers MUST persist this list separately for transcript media metadata.
* It is intentionally separate from `images` because downstream model calls
* do not receive these as inline image blocks.
*
* NOTE(review): non-image entries appear to leave `message` untouched, so
* the caller has to route their `path` to the agent itself — confirm.
*
* ⚠️ Call sites select the image policy through the parser options:
* `supportsImages: false` offloads images as media refs, while
* `supportsInlineImages: false` (without `supportsImages: false`) rejects
* images with UnsupportedAttachmentError("text-only-image").
*/
offloadedRefs: OffloadedRef[];
};
@@ -78,49 +50,38 @@ type NormalizedAttachment = {
type SavedMedia = {
id: string;
path?: string;
path: string;
};
const OFFLOAD_THRESHOLD_BYTES = 2_000_000;
const TEXT_ONLY_OFFLOAD_LIMIT = 10;
const MIME_TO_EXT: Record<string, string> = {
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/png": ".png",
"image/webp": ".webp",
"image/gif": ".gif",
"image/heic": ".heic",
"image/heif": ".heif",
// bmp/tiff excluded from SUPPORTED_OFFLOAD_MIMES to avoid extension-loss
// bug in store.ts; entries kept here for future extension support
"image/bmp": ".bmp",
"image/tiff": ".tiff",
};
export const DEFAULT_CHAT_ATTACHMENT_MAX_MB = 20;
// Module-level Set for O(1) lookup — not rebuilt on every attachment iteration.
//
// heic/heif are included only if store.ts's extensionForMime maps them to an
// extension. If it does not (same extension-loss risk as bmp/tiff), remove
// them from this set.
const SUPPORTED_OFFLOAD_MIMES = new Set([
"image/jpeg",
"image/jpg",
"image/png",
"image/webp",
"image/gif",
"image/heic",
"image/heif",
]);
/**
 * Resolve the chat attachment size limit, in bytes, from configuration.
 *
 * Reads `cfg.agents.defaults.mediaMaxMb`. The configured value is used only
 * when it is a finite number greater than zero; anything else (missing,
 * non-numeric, NaN, Infinity, zero, negative) falls back to
 * DEFAULT_CHAT_ATTACHMENT_MAX_MB.
 *
 * @param cfg - Loaded OpenClaw configuration.
 * @returns The limit in bytes, rounded down to a whole number.
 */
export function resolveChatAttachmentMaxBytes(cfg: OpenClawConfig): number {
  const requestedMb = cfg.agents?.defaults?.mediaMaxMb;
  if (typeof requestedMb === "number" && Number.isFinite(requestedMb) && requestedMb > 0) {
    // Fractional megabyte values are allowed, hence the floor.
    return Math.floor(requestedMb * 1024 * 1024);
  }
  return Math.floor(DEFAULT_CHAT_ATTACHMENT_MAX_MB * 1024 * 1024);
}
/** Machine-readable category attached to an UnsupportedAttachmentError. */
export type UnsupportedAttachmentReason =
  | "empty-payload"
  | "text-only-image"
  | "unsupported-non-image"
  | "non-image-too-large-for-sandbox";

/**
 * Thrown when an attachment cannot be accepted as sent.
 *
 * `reason` carries the category so callers can branch on it instead of
 * parsing the human-readable `message`.
 */
export class UnsupportedAttachmentError extends Error {
  /** Category of the rejection. */
  readonly reason: UnsupportedAttachmentReason;

  /** Fixed error name so the error can be identified by `name` as well as by `instanceof`. */
  override name = "UnsupportedAttachmentError";

  /**
   * @param reason - Category of the rejection.
   * @param message - Human-readable description.
   */
  constructor(reason: UnsupportedAttachmentReason, message: string) {
    super(message);
    this.reason = reason;
  }
}
/**
* Raised when the Gateway cannot persist an attachment to the media store.
*
* Distinct from ordinary input-validation errors so that Gateway handlers can
* map it to a server-side 5xx status rather than a client 4xx.
*
* Example causes: ENOSPC, EPERM, unexpected saveMediaBuffer return shape.
*/
export class MediaOffloadError extends Error {
readonly cause: unknown;
constructor(message: string, options?: ErrorOptions) {
@@ -142,42 +103,17 @@ function isImageMime(mime?: string): boolean {
return typeof mime === "string" && mime.startsWith("image/");
}
/**
 * Tell whether a MIME type belongs to the `video/*` family.
 *
 * @param mime - MIME type to inspect; may be absent.
 * @returns `true` only for a string that starts with "video/".
 */
function isVideoMime(mime?: string): boolean {
  if (typeof mime !== "string") {
    return false;
  }
  return mime.startsWith("video/");
}
function isGenericMime(mime?: string): boolean {
return (
!mime ||
mime === "application/octet-stream" ||
mime === "binary/octet-stream" ||
mime === "application/unknown"
);
/**
 * Tell whether a MIME type is too generic to identify a document on its own.
 *
 * Covers the ZIP container type (which OOXML files such as .docx/.xlsx sniff
 * as) and the "unknown binary" placeholders. Callers use this to let a more
 * specific MIME from another source (client-provided or filename-derived)
 * win over a generic one.
 *
 * `binary/octet-stream` and `application/unknown` are treated like
 * `application/octet-stream`; otherwise a client that sends one of these
 * placeholders would override a specific MIME recovered from the filename.
 *
 * @param mime - Normalised (lower-case) MIME type; may be absent.
 * @returns `true` for a generic container/placeholder type, `false` otherwise
 *   (including when `mime` is absent).
 */
function isGenericContainerMime(mime?: string): boolean {
  switch (mime) {
    case "application/zip":
    case "application/octet-stream":
    case "binary/octet-stream":
    case "application/unknown":
      return true;
    default:
      return false;
  }
}
/**
 * Strict well-formedness check for a standard-alphabet base64 string.
 *
 * Accepts only non-empty input whose length is a multiple of four and that
 * consists solely of `A-Z a-z 0-9 + /` followed by at most two `=` padding
 * characters. Whitespace and URL-safe characters are rejected.
 *
 * @param value - Candidate base64 text.
 * @returns `true` when the text is well-formed, `false` otherwise.
 */
function isValidBase64(value: string): boolean {
  const hasWholeQuads = value.length > 0 && value.length % 4 === 0;
  // The pattern has no nested or overlapping quantifiers, so the scan stays
  // linear in the input length even for hostile payloads.
  return hasWholeQuads && /^[A-Za-z0-9+/]+={0,2}$/.test(value);
}
/**
* Confirms that the decoded buffer produced by Buffer.from(b64, 'base64')
* matches the pre-decode size estimate.
*
* Node's Buffer.from silently drops invalid base64 characters rather than
* throwing. A material size discrepancy means the source string contained
* embedded garbage that was silently stripped, which would produce a corrupted
* file on disk. ±3 bytes of leeway accounts for base64 padding rounding.
*
* IMPORTANT: this is an input-validation check (4xx client error).
* It MUST be called OUTSIDE the MediaOffloadError try/catch so that
* corrupt-input errors are not misclassified as 5xx server errors.
*/
function verifyDecodedSize(buffer: Buffer, estimatedBytes: number, label: string): void {
if (Math.abs(buffer.byteLength - estimatedBytes) > 3) {
throw new Error(
@@ -191,40 +127,34 @@ function ensureExtension(label: string, mime: string): string {
if (/\.[a-zA-Z0-9]+$/.test(label)) {
return label;
}
const ext = MIME_TO_EXT[normalizeLowercaseStringOrEmpty(mime)] ?? "";
const ext = extensionForMime(mime) ?? "";
return ext ? `${label}${ext}` : label;
}
/**
* Type guard for the return value of saveMediaBuffer.
*
* Also validates that the returned ID:
* - is a non-empty string
* - contains no path separators (/ or \) or null bytes
*
* Catching a bad shape here produces a cleaner error than a cryptic failure
* deeper in the stack, and is treated as a 5xx infrastructure error.
*/
function assertSavedMedia(value: unknown, label: string): SavedMedia {
if (
value !== null &&
typeof value === "object" &&
"id" in value &&
typeof (value as Record<string, unknown>).id === "string"
value === null ||
typeof value !== "object" ||
!("id" in value) ||
typeof (value as Record<string, unknown>).id !== "string"
) {
const id = (value as Record<string, unknown>).id as string;
if (id.length === 0) {
throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`);
}
if (id.includes("/") || id.includes("\\") || id.includes("\0")) {
throw new Error(
`attachment ${label}: saveMediaBuffer returned an unsafe media ID ` +
`(contains path separator or null byte)`,
);
}
return value as SavedMedia;
throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`);
}
throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`);
const id = (value as Record<string, unknown>).id as string;
if (id.length === 0) {
throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`);
}
if (id.includes("/") || id.includes("\\") || id.includes("\0")) {
throw new Error(
`attachment ${label}: saveMediaBuffer returned an unsafe media ID ` +
`(contains path separator or null byte)`,
);
}
const path = (value as Record<string, unknown>).path;
if (typeof path !== "string" || path.length === 0) {
throw new Error(`attachment ${label}: saveMediaBuffer returned no on-disk path`);
}
return { id, path };
}
function normalizeAttachment(
@@ -269,47 +199,22 @@ function validateAttachmentBase64OrThrow(
return sizeBytes;
}
/**
* Parse attachments and extract images as structured content blocks.
* Returns the message text, inline image blocks, and offloaded media refs.
*
* ## Offload behaviour
* Attachments whose decoded size exceeds OFFLOAD_THRESHOLD_BYTES are saved to
* disk via saveMediaBuffer and replaced with an opaque `media://inbound/<id>`
* URI appended to the message. The agent resolves these URIs via
* resolveMediaBufferPath before passing them to the model.
*
* ## Transcript metadata
* Callers MUST use `result.offloadedRefs` to persist structured media metadata
* for transcripts. These refs are intentionally excluded from `result.images`
* because they are not passed inline to the model.
*
* ## Text-only model runs
* Pass `supportsImages: false` for text-only model runs so images are offloaded
* as `media://inbound/<id>` refs instead of being sent as inline image blocks.
* The agent runner can then resolve the refs through the normal media path.
*
* ## Cleanup on failure
* On any parse failure after files have already been offloaded, best-effort
* cleanup is performed before rethrowing so that malformed requests do not
* accumulate orphaned files on disk ahead of the periodic TTL sweep.
*
* ## Known ordering limitation
* In mixed large/small batches, the model receives images in a different order
* than the original attachment list because detectAndLoadPromptImages
* initialises from existingImages first, then appends prompt-detected refs.
* A future refactor should unify all image references into a single ordered list.
*
* @throws {MediaOffloadError} Infrastructure failure saving to media store → 5xx.
* @throws {Error} Input validation failure → 4xx.
*/
export async function parseMessageWithAttachments(
message: string,
attachments: ChatAttachment[] | undefined,
opts?: { maxBytes?: number; log?: AttachmentLog; supportsImages?: boolean },
opts?: {
maxBytes?: number;
log?: AttachmentLog;
supportsImages?: boolean;
supportsInlineImages?: boolean;
acceptNonImage?: boolean;
},
): Promise<ParsedMessageWithImages> {
const maxBytes = opts?.maxBytes ?? 5_000_000;
const maxBytes = opts?.maxBytes ?? DEFAULT_CHAT_ATTACHMENT_MAX_MB * 1024 * 1024;
const log = opts?.log;
const shouldForceImageOffload = opts?.supportsImages === false;
const supportsInlineImages = opts?.supportsInlineImages !== false;
const acceptNonImage = opts?.acceptNonImage !== false;
if (!attachments || attachments.length === 0) {
return { message, images: [], imageOrder: [], offloadedRefs: [] };
@@ -319,11 +224,7 @@ export async function parseMessageWithAttachments(
const imageOrder: PromptImageOrderEntry[] = [];
const offloadedRefs: OffloadedRef[] = [];
let updatedMessage = message;
const shouldForceOffload = opts?.supportsImages === false;
let textOnlyImageOffloadCount = 0;
// Track IDs of files saved during this request for cleanup if a later
// attachment fails validation and the entire parse is aborted.
const savedMediaIds: string[] = [];
try {
@@ -339,16 +240,14 @@ export async function parseMessageWithAttachments(
const { base64: b64, label, mime } = normalized;
if (b64.length === 0) {
throw new UnsupportedAttachmentError("empty-payload", `attachment ${label}: empty payload`);
}
if (!isValidBase64(b64)) {
throw new Error(`attachment ${label}: invalid base64 content`);
}
const sizeBytes = estimateBase64DecodedBytes(b64);
if (sizeBytes <= 0) {
log?.warn(`attachment ${label}: estimated size is zero, dropping`);
continue;
}
if (sizeBytes > maxBytes) {
throw new Error(
`attachment ${label}: exceeds size limit (${sizeBytes} > ${maxBytes} bytes)`,
@@ -357,67 +256,48 @@ export async function parseMessageWithAttachments(
const providedMime = normalizeMime(mime);
const sniffedMime = normalizeMime(await sniffMimeFromBase64(b64));
const labelMime = normalizeMime(mimeTypeFromFilePath(label));
const finalMime =
(sniffedMime && !isGenericContainerMime(sniffedMime) && sniffedMime) ||
(providedMime && !isGenericContainerMime(providedMime) && providedMime) ||
(labelMime && !isGenericContainerMime(labelMime) && labelMime) ||
sniffedMime ||
providedMime ||
labelMime ||
"application/octet-stream";
if (sniffedMime && !isImageMime(sniffedMime) && isImageMime(providedMime)) {
log?.warn(`attachment ${label}: detected non-image (${sniffedMime}), dropping`);
continue;
}
const shouldHandleAsImage =
isImageMime(sniffedMime) || (isImageMime(providedMime) && !sniffedMime);
if (!shouldHandleAsImage) {
const finalMime = sniffedMime ?? providedMime ?? "application/octet-stream";
if (isVideoMime(finalMime)) {
log?.warn(`attachment ${label}: video attachments are not supported, dropping`);
continue;
}
const buffer = Buffer.from(b64, "base64");
verifyDecodedSize(buffer, sizeBytes, label);
try {
const rawResult = await saveMediaBuffer(buffer, finalMime, "inbound", maxBytes, label);
const savedMedia = assertSavedMedia(rawResult, label);
savedMediaIds.push(savedMedia.id);
const mediaRef = `media://inbound/${savedMedia.id}`;
updatedMessage += `\n[media attached: ${mediaRef}]`;
log?.info?.(`[Gateway] Saved file attachment. Saved: ${mediaRef}`);
offloadedRefs.push({
mediaRef,
id: savedMedia.id,
path: savedMedia.path ?? "",
mimeType: finalMime,
label,
});
imageOrder.push("offloaded");
} catch (err) {
const errorMessage = formatErrorMessage(err);
throw new MediaOffloadError(
`[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`,
{ cause: err },
);
}
continue;
}
if (
sniffedMime &&
providedMime &&
!isGenericMime(providedMime) &&
sniffedMime !== providedMime
) {
if (sniffedMime && providedMime && sniffedMime !== providedMime) {
const usedSource =
finalMime === sniffedMime
? "sniffed"
: finalMime === providedMime
? "provided"
: "label-derived";
log?.warn(
`attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using sniffed`,
`attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using ${usedSource}`,
);
}
// Third fallback normalises `mime` so a raw un-normalised string (e.g.
// "IMAGE/JPEG") does not silently bypass the SUPPORTED_OFFLOAD_MIMES check.
const finalMime = sniffedMime ?? providedMime ?? normalizeMime(mime) ?? mime;
const isImage = isImageMime(finalMime);
if (isImage && !supportsInlineImages && !shouldForceImageOffload) {
throw new UnsupportedAttachmentError(
"text-only-image",
`attachment ${label}: active model does not accept image inputs`,
);
}
if (!isImage && !acceptNonImage) {
throw new UnsupportedAttachmentError(
"unsupported-non-image",
`attachment ${label}: non-image attachments (${finalMime}) are not supported on this entrypoint`,
);
}
if (isImage && sizeBytes > MAX_IMAGE_BYTES) {
throw new Error(
`attachment ${label}: image exceeds size limit (${sizeBytes} > ${MAX_IMAGE_BYTES} bytes)`,
);
}
let isOffloaded = false;
if (shouldForceOffload && textOnlyImageOffloadCount >= TEXT_ONLY_OFFLOAD_LIMIT) {
if (shouldForceImageOffload && isImage && textOnlyImageOffloadCount >= TEXT_ONLY_OFFLOAD_LIMIT) {
log?.warn(
`attachment ${label}: dropping image because text-only offload limit ` +
`${TEXT_ONLY_OFFLOAD_LIMIT} was reached`,
@@ -426,93 +306,63 @@ export async function parseMessageWithAttachments(
continue;
}
if (shouldForceOffload || sizeBytes > OFFLOAD_THRESHOLD_BYTES) {
const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime);
const shouldOffload = shouldForceImageOffload || !isImage || sizeBytes > OFFLOAD_THRESHOLD_BYTES;
if (!isSupportedForOffload) {
if (shouldForceOffload) {
log?.warn(
`attachment ${label}: format ${finalMime} cannot be offloaded for ` +
"text-only model, dropping",
);
continue;
}
// Passing this inline would reintroduce the OOM risk this PR prevents.
throw new Error(
`attachment ${label}: format ${finalMime} is too large to pass inline ` +
`(${sizeBytes} > ${OFFLOAD_THRESHOLD_BYTES} bytes) and cannot be offloaded. ` +
`Please convert to JPEG, PNG, WEBP, GIF, HEIC, or HEIF.`,
);
}
// Decode and run input-validation BEFORE the MediaOffloadError try/catch.
// verifyDecodedSize is a 4xx client error and must not be wrapped as a
// 5xx MediaOffloadError.
const buffer = Buffer.from(b64, "base64");
verifyDecodedSize(buffer, sizeBytes, label);
// Only the storage operation is wrapped so callers can distinguish
// infrastructure failures (5xx) from input errors (4xx).
try {
const labelWithExt = ensureExtension(label, finalMime);
const rawResult = await saveMediaBuffer(
buffer,
finalMime,
"inbound",
maxBytes,
labelWithExt,
);
const savedMedia = assertSavedMedia(rawResult, label);
// Track for cleanup if a subsequent attachment fails.
savedMediaIds.push(savedMedia.id);
// Opaque URI — compatible with workspaceOnly sandboxes and decouples
// the Gateway from the agent's filesystem layout.
const mediaRef = `media://inbound/${savedMedia.id}`;
updatedMessage += `\n[media attached: ${mediaRef}]`;
log?.info?.(
shouldForceOffload
? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
: `[Gateway] Intercepted large image payload. Saved: ${mediaRef}`,
);
// Record for transcript metadata — separate from `images` because
// these are not passed inline to the model.
offloadedRefs.push({
mediaRef,
id: savedMedia.id,
path: savedMedia.path ?? "",
mimeType: finalMime,
label,
});
imageOrder.push("offloaded");
if (shouldForceOffload) {
textOnlyImageOffloadCount++;
}
isOffloaded = true;
} catch (err) {
const errorMessage = formatErrorMessage(err);
throw new MediaOffloadError(
`[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`,
{ cause: err },
);
}
}
if (isOffloaded) {
if (!shouldOffload) {
images.push({ type: "image", data: b64, mimeType: finalMime });
imageOrder.push("inline");
continue;
}
images.push({ type: "image", data: b64, mimeType: finalMime });
imageOrder.push("inline");
const buffer = Buffer.from(b64, "base64");
verifyDecodedSize(buffer, sizeBytes, label);
let savedMedia: SavedMedia;
try {
const labelWithExt = ensureExtension(label, finalMime);
const rawResult = await saveMediaBuffer(
buffer,
finalMime,
"inbound",
maxBytes,
labelWithExt,
);
savedMedia = assertSavedMedia(rawResult, label);
} catch (err) {
throw new MediaOffloadError(
`[Gateway Error] Failed to save intercepted media to disk: ${formatErrorMessage(err)}`,
{ cause: err },
);
}
savedMediaIds.push(savedMedia.id);
const mediaRef = `media://inbound/${savedMedia.id}`;
if (isImage) {
updatedMessage += `\n[media attached: ${mediaRef}]`;
}
log?.info?.(
shouldForceImageOffload && isImage
? `[Gateway] Offloaded image for text-only model. Saved: ${mediaRef}`
: `[Gateway] Offloaded attachment (${finalMime}). Saved: ${mediaRef}`,
);
offloadedRefs.push({
mediaRef,
id: savedMedia.id,
path: savedMedia.path,
mimeType: finalMime,
label,
sizeBytes,
});
if (isImage) {
imageOrder.push("offloaded");
if (shouldForceImageOffload) {
textOnlyImageOffloadCount++;
}
}
}
} catch (err) {
// Best-effort cleanup before rethrowing.
if (savedMediaIds.length > 0) {
await Promise.allSettled(savedMediaIds.map((id) => deleteMediaBuffer(id, "inbound")));
}

View File

@@ -498,7 +498,7 @@ export const agentHandlers: GatewayRequestHandlers = {
}
const effectiveProvider = providerOverride || baseProvider;
const effectiveModel = modelOverride || baseModel;
const supportsImages = await resolveGatewayModelSupportsImages({
const supportsInlineImages = await resolveGatewayModelSupportsImages({
loadGatewayModelCatalog: context.loadGatewayModelCatalog,
provider: effectiveProvider,
model: effectiveModel,
@@ -508,7 +508,11 @@ export const agentHandlers: GatewayRequestHandlers = {
const parsed = await parseMessageWithAttachments(message, normalizedAttachments, {
maxBytes: 5_000_000,
log: context.logGateway,
supportsImages,
supportsInlineImages,
// agent.run does not yet wire a ctx.MediaPaths stage path, so reject
// non-image attachments explicitly (UnsupportedAttachmentError)
// instead of saving them where the agent cannot reach them.
acceptNonImage: false,
});
message = parsed.message.trim();
images = parsed.images;

View File

@@ -2335,6 +2335,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
],
},
expectBroadcast: false,
waitFor: "none",
});
expect(mockState.lastDispatchImages).toBeUndefined();
@@ -2449,6 +2450,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
],
},
expectBroadcast: false,
waitFor: "none",
});
expect(mockState.lastDispatchImages).toBeUndefined();
@@ -2461,6 +2463,65 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
]);
});
it("routes non-image offloaded refs into ctx.MediaPaths + MediaTypes for chat.send", async () => {
  // Shared fixture values: the MIME type sent by the client and the path the
  // mocked media store reports for the saved PDF.
  const pdfMime = "application/pdf";
  const savedPdfPath = "/home/user/.openclaw/media/inbound/report.pdf";

  createTranscriptFixture("openclaw-chat-send-non-image-ctx-media-paths-");
  mockState.finalText = "ok";
  mockState.sessionEntry = { modelProvider: "test-provider", model: "vision-model" };
  mockState.modelCatalog = [
    {
      provider: "test-provider",
      id: "vision-model",
      name: "Vision model",
      input: ["text", "image"],
    },
  ];
  mockState.savedMediaResults = [{ path: savedPdfPath, contentType: pdfMime }];

  const respond = vi.fn();
  const context = createChatContext();
  // Minimal PDF-looking payload, base64-encoded the way chat.send receives it.
  const pdfBase64 = Buffer.from("%PDF-1.4\n%µ¶\n1 0 obj\n<<>>\nendobj\n").toString("base64");
  const pdfAttachment = {
    type: "file",
    mimeType: pdfMime,
    fileName: "report.pdf",
    content: pdfBase64,
  };

  await runNonStreamingChatSend({
    context,
    respond,
    idempotencyKey: "idem-non-image-ctx-media",
    message: "read this",
    requestParams: { attachments: [pdfAttachment] },
    expectBroadcast: false,
  });

  // Both the plural and the singular media fields must carry the saved file.
  expect(mockState.lastDispatchCtx?.MediaPaths).toEqual([savedPdfPath]);
  expect(mockState.lastDispatchCtx?.MediaPath).toBe(savedPdfPath);
  expect(mockState.lastDispatchCtx?.MediaTypes).toEqual([pdfMime]);
  expect(mockState.lastDispatchCtx?.MediaType).toBe(pdfMime);

  // Non-image offloads must never put a media:// URI in the prompt body; the
  // file travels via ctx.MediaPaths so buildInboundMediaNote can prepend the
  // real path without producing a duplicate media marker.
  expect(mockState.lastDispatchCtx?.Body).not.toContain("media://");
  expect(mockState.lastDispatchCtx?.BodyForAgent).not.toContain("media://");
  expect(mockState.lastDispatchImages).toBeUndefined();

  // The explicit MediaStaged marker is the skip contract that get-reply.ts
  // honors, replacing the older implicit "relative-path no-op" coupling.
  expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
});
it("passes imageOrder for mixed inline and offloaded chat.send attachments", async () => {
createTranscriptFixture("openclaw-chat-send-image-order-");
mockState.finalText = "ok";

View File

@@ -2,16 +2,19 @@ import fs from "node:fs";
import path from "node:path";
import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent";
import { resolveSendableOutboundReplyParts } from "openclaw/plugin-sdk/reply-payload";
import { resolveSessionAgentId } from "../../agents/agent-scope.js";
import { resolveAgentWorkspaceDir, resolveSessionAgentId } from "../../agents/agent-scope.js";
import { resolveThinkingDefault } from "../../agents/model-selection.js";
import { rewriteTranscriptEntriesInSessionFile } from "../../agents/pi-embedded-runner/transcript-rewrite.js";
import { ensureSandboxWorkspaceForSession } from "../../agents/sandbox/context.js";
import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
import { dispatchInboundMessage } from "../../auto-reply/dispatch.js";
import type { ReplyPayload } from "../../auto-reply/reply-payload.js";
import { createReplyDispatcher } from "../../auto-reply/reply/reply-dispatcher.js";
import type { MsgContext } from "../../auto-reply/templating.js";
import { stageSandboxMedia } from "../../auto-reply/reply/stage-sandbox-media.js";
import type { MsgContext, TemplateContext } from "../../auto-reply/templating.js";
import { extractCanvasFromText } from "../../chat/canvas-render.js";
import { resolveSessionFilePath } from "../../config/sessions.js";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { jsonUtf8Bytes } from "../../infra/json-utf8-bytes.js";
import { normalizeReplyPayloadsForDelivery } from "../../infra/outbound/payloads.js";
import { getSessionBindingService } from "../../infra/outbound/session-binding-service.js";
@@ -22,7 +25,7 @@ import {
} from "../../media/local-roots.js";
import { isAudioFileName } from "../../media/mime.js";
import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js";
import { type SavedMedia, saveMediaBuffer } from "../../media/store.js";
import { deleteMediaBuffer, type SavedMedia, saveMediaBuffer } from "../../media/store.js";
import { createChannelReplyPipeline } from "../../plugin-sdk/channel-reply-pipeline.js";
import { isPluginOwnedSessionBindingRecord } from "../../plugins/conversation-binding.js";
import { normalizeInputProvenance, type InputProvenance } from "../../sessions/input-provenance.js";
@@ -50,6 +53,7 @@ import {
type ChatImageContent,
type OffloadedRef,
parseMessageWithAttachments,
resolveChatAttachmentMaxBytes,
} from "../chat-attachments.js";
import { MediaOffloadError } from "../chat-attachments.js";
import {
@@ -766,6 +770,64 @@ function buildChatSendTranscriptMessage(params: {
};
}
// Stages non-image offloads into the agent sandbox synchronously so chat.send
// can surface 5xx before respond(). Throws MediaOffloadError on partial-stage
// instead of silently losing files (channel path is best-effort; chat.send is
// strong-delivery RPC). Callers MUST set ctx.MediaStaged=true when this runs
// so the dispatch pipeline skips its own stageSandboxMedia pass.
//
// Returns:
//   - empty `paths`/`types` when the request carried no non-image offloads;
//   - the original saved paths (no `workspaceDir`) when no sandbox applies to
//     the session;
//   - the staged paths plus the sandbox `workspaceDir` otherwise.
//
// On ANY failure (sandbox resolution throws, staging throws, or staging leaves
// paths un-rewritten) every media buffer saved for this request is deleted
// best-effort before the error propagates, so a rejected request does not
// leave orphaned files in the inbound store. Errors thrown by the sandbox or
// staging helpers are rethrown unchanged; only the incomplete-stage case
// raises MediaOffloadError from here.
async function prestageNonImageOffloads(params: {
  offloadedRefs: OffloadedRef[];
  cfg: OpenClawConfig;
  sessionKey: string;
  agentId: string;
}): Promise<{ paths: string[]; types: string[]; workspaceDir?: string }> {
  const nonImage = params.offloadedRefs.filter((ref) => !ref.mimeType.startsWith("image/"));
  if (nonImage.length === 0) {
    return { paths: [], types: [] };
  }
  const workspaceDir = resolveAgentWorkspaceDir(params.cfg, params.agentId);
  try {
    const sandbox = await ensureSandboxWorkspaceForSession({
      config: params.cfg,
      sessionKey: params.sessionKey,
      workspaceDir,
    });
    if (!sandbox) {
      // No sandbox for this session: hand back the saved paths untouched.
      return {
        paths: nonImage.map((ref) => ref.path),
        types: nonImage.map((ref) => ref.mimeType),
      };
    }
    // Throwaway context: stageSandboxMedia reports its result by updating the
    // media fields on the ctx it is given, so the outcome is read back below.
    const stagingCtx: MsgContext = {
      MediaPath: nonImage[0].path,
      MediaPaths: nonImage.map((ref) => ref.path),
      MediaType: nonImage[0].mimeType,
      MediaTypes: nonImage.map((ref) => ref.mimeType),
    };
    await stageSandboxMedia({
      ctx: stagingCtx,
      sessionCtx: stagingCtx as TemplateContext,
      cfg: params.cfg,
      sessionKey: params.sessionKey,
      workspaceDir,
    });
    const stagedPaths = stagingCtx.MediaPaths ?? [];
    const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);
    // A path counts as staged only once it is no longer absolute. Count those
    // explicitly so the error below reports how many were really rewritten,
    // not merely how many paths came back.
    const rewrittenCount = stagedPaths.filter((p) => !path.isAbsolute(p)).length;
    const allRewritten =
      stagedPaths.length === nonImage.length && rewrittenCount === stagedPaths.length;
    if (!allRewritten) {
      throw new MediaOffloadError(
        `non-image attachment staging incomplete: ${rewrittenCount}/${nonImage.length} paths rewritten into sandbox workspace`,
      );
    }
    return { paths: stagedPaths, types: stagedTypes, workspaceDir: sandbox.workspaceDir };
  } catch (err) {
    // Best-effort cleanup of everything saved for this request (images
    // included, since the whole request is being rejected). allSettled keeps a
    // failed delete from masking the original error.
    await Promise.allSettled(
      params.offloadedRefs.map((ref) => deleteMediaBuffer(ref.id, "inbound")),
    );
    throw err;
  }
}
function resolveChatSendTranscriptMediaFields(savedImages: SavedMedia[]) {
const mediaPaths = savedImages.map((entry) => entry.path);
if (mediaPaths.length === 0) {
@@ -1759,6 +1821,9 @@ export const chatHandlers: GatewayRequestHandlers = {
let parsedImages: ChatImageContent[] = [];
let imageOrder: PromptImageOrderEntry[] = [];
let offloadedRefs: OffloadedRef[] = [];
let nonImageMediaPaths: string[] = [];
let nonImageMediaTypes: string[] = [];
let nonImageMediaWorkspaceDir: string | undefined;
const timeoutMs = resolveAgentTimeoutMs({
cfg,
overrideMs: p.timeoutMs,
@@ -1833,14 +1898,27 @@ export const chatHandlers: GatewayRequestHandlers = {
explicitOriginTargetsPlugin;
try {
const parsed = await parseMessageWithAttachments(inboundMessage, normalizedAttachments, {
maxBytes: 5_000_000,
maxBytes: resolveChatAttachmentMaxBytes(cfg),
log: context.logGateway,
supportsImages,
// chat.send routes non-image offloadedRefs into ctx.MediaPaths below
// so the auto-reply stage pipeline can surface them to the agent.
acceptNonImage: true,
});
parsedMessage = parsed.message;
parsedImages = parsed.images;
imageOrder = parsed.imageOrder;
offloadedRefs = parsed.offloadedRefs;
({
paths: nonImageMediaPaths,
types: nonImageMediaTypes,
workspaceDir: nonImageMediaWorkspaceDir,
} = await prestageNonImageOffloads({
offloadedRefs,
cfg,
sessionKey,
agentId,
}));
} catch (err) {
respond(
false,
@@ -1947,6 +2025,19 @@ export const chatHandlers: GatewayRequestHandlers = {
GatewayClientScopes: client?.connect?.scopes ?? [],
...pluginBoundMediaFields,
};
if (nonImageMediaPaths.length > 0) {
// Inject non-image offloads via the same MsgContext fields the channel
// path uses so buildInboundMediaNote renders a real `[media attached:
// <workspace-relative-path>]` line into the agent prompt. Marker
// blocks the dispatch pipeline from re-running stageSandboxMedia; see
// prestageNonImageOffloads.
ctx.MediaPath = nonImageMediaPaths[0];
ctx.MediaPaths = nonImageMediaPaths;
ctx.MediaType = nonImageMediaTypes[0];
ctx.MediaTypes = nonImageMediaTypes;
ctx.MediaWorkspaceDir = nonImageMediaWorkspaceDir;
ctx.MediaStaged = true;
}
const { onModelSelected, ...replyPipeline } = createChannelReplyPipeline({
cfg,

View File

@@ -923,7 +923,7 @@ describe("agent request events", () => {
expect(opts.runId).toBe(opts.sessionId);
});
it("passes supportsImages false for text-only node-session models", async () => {
it("passes supportsInlineImages false for text-only node-session models", async () => {
const ctx = buildCtx();
ctx.loadGatewayModelCatalog = async () => [
{
@@ -960,7 +960,41 @@ describe("agent request events", () => {
expect(parseMessageWithAttachmentsMock).toHaveBeenCalledWith(
"describe",
expect.any(Array),
expect.objectContaining({ supportsImages: false }),
expect.objectContaining({ supportsInlineImages: false }),
);
});
it("declines non-image attachments cleanly when parse throws UnsupportedAttachmentError", async () => {
  const warn = vi.fn();
  const ctx = buildCtx();
  ctx.logGateway = { warn };

  // Simulate the parser refusing the PDF: an Error decorated with the name and
  // reason fields of UnsupportedAttachmentError.
  const refusal = Object.assign(
    new Error("attachment a.pdf: non-image attachments not supported"),
    {
      name: "UnsupportedAttachmentError",
      reason: "unsupported-non-image",
    },
  );
  parseMessageWithAttachmentsMock.mockRejectedValueOnce(refusal);

  const requestPayload = {
    message: "read this",
    sessionKey: "agent:main:main",
    attachments: [
      {
        type: "file",
        mimeType: "application/pdf",
        fileName: "a.pdf",
        content: "JVBERi0=",
      },
    ],
  };
  await handleNodeEvent(ctx, "node-non-image-refusal", {
    event: "agent.request",
    payloadJSON: JSON.stringify(requestPayload),
  });

  // On a parse failure server-node-events must log and return: the agent is
  // never dispatched, nothing crashes, and the refusal reason is visible in
  // the gateway warning.
  expect(agentCommandMock).not.toHaveBeenCalled();
  expect(warn).toHaveBeenCalledWith(expect.stringMatching(/attachment parse failed.*non-image/i));
});
});

View File

@@ -427,7 +427,7 @@ export const handleNodeEvent = async (ctx: NodeEventContext, nodeId: string, evt
if (normalizedAttachments.length > 0) {
const sessionAgentId = resolveSessionAgentId({ sessionKey, config: cfg });
const modelRef = resolveSessionModelRef(cfg, entry, sessionAgentId);
const supportsImages = await resolveGatewayModelSupportsImages({
const supportsInlineImages = await resolveGatewayModelSupportsImages({
loadGatewayModelCatalog: ctx.loadGatewayModelCatalog,
provider: modelRef.provider,
model: modelRef.model,
@@ -436,7 +436,11 @@ export const handleNodeEvent = async (ctx: NodeEventContext, nodeId: string, evt
const parsed = await parseMessageWithAttachments(message, normalizedAttachments, {
maxBytes: 5_000_000,
log: ctx.logGateway,
supportsImages,
supportsInlineImages,
// server-node-events dispatches via agentCommandFromIngress which
// has no ctx.MediaPaths wiring; reject non-image attachments
// explicitly rather than saving them where the agent cannot reach them.
acceptNonImage: false,
});
message = parsed.message.trim();
images = parsed.images;