fix(gateway): route text-only chat images to media understanding

This commit is contained in:
Peter Steinberger
2026-04-28 06:45:20 +01:00
parent 0fc1cdec45
commit 0113248d91
3 changed files with 86 additions and 50 deletions

View File

@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Cron/Telegram: preserve explicit `:topic:` delivery targets over stale session-derived thread IDs when isolated cron announces to Telegram forum topics. Carries forward #59069; refs #49704 and #43808. Thanks @roytong9.
- Gateway/media: route text-only `chat.send` image offloads through media-understanding fields so `agents.defaults.imageModel` can describe WebChat attachments instead of leaving only an opaque `media://inbound` marker. Fixes #72968. Thanks @vorajeeah.
- CLI/onboarding: infer image input for common custom-provider vision model IDs, ask only for unknown models, and keep `--custom-image-input`/`--custom-text-input` overrides so vision-capable proxies do not get saved as text-only configs. Fixes #51869. Thanks @Antsoldier1974.
- Models/OpenAI Codex: stop listing or resolving unsupported `openai-codex/gpt-5.4-mini` rows through Codex OAuth, keep stale discovery rows suppressed with a clear API-key-route hint, and leave direct `openai/gpt-5.4-mini` available. Fixes #73242. Thanks @0xCyda.
- Memory/Dreaming: retry Dream Diary once with the session default when a configured dreaming model is unavailable, while leaving subagent trust and allowlist errors visible instead of silently masking configuration problems. Refs #67409 and #69209. Thanks @Ghiggins18 and @everySympathy.

View File

@@ -2357,7 +2357,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expect(JSON.stringify(transcriptUpdate)).not.toContain("[[audio_as_voice]]");
});
it("offloads image attachments for text-only session models", async () => {
it("routes text-only image offloads into media-understanding fields", async () => {
createTranscriptFixture("openclaw-chat-send-text-only-attachments-");
mockState.finalText = "ok";
mockState.sessionEntry = {
@@ -2394,10 +2394,14 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
});
expect(mockState.lastDispatchImages).toBeUndefined();
expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
expect(mockState.lastDispatchCtx?.Body).toMatch(
/^describe image\n\[media attached: media:\/\/inbound\//,
);
expect(mockState.lastDispatchImageOrder).toBeUndefined();
expect(mockState.lastDispatchCtx?.Body).toBe("describe image");
expect(mockState.lastDispatchCtx?.Body).not.toContain("media://");
expect(mockState.lastDispatchCtx?.MediaPath).toBe("/tmp/1.png");
expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["/tmp/1.png"]);
expect(mockState.lastDispatchCtx?.MediaType).toBe("image/png");
expect(mockState.lastDispatchCtx?.MediaTypes).toEqual(["image/png"]);
expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
expect(mockState.savedMediaCalls).toEqual([
expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
]);
@@ -2557,10 +2561,14 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
});
expect(mockState.lastDispatchImages).toBeUndefined();
expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
expect(mockState.lastDispatchCtx?.Body).toMatch(
/^describe image\n\[media attached: media:\/\/inbound\//,
);
expect(mockState.lastDispatchImageOrder).toBeUndefined();
expect(mockState.lastDispatchCtx?.Body).toBe("describe image");
expect(mockState.lastDispatchCtx?.Body).not.toContain("media://");
expect(mockState.lastDispatchCtx?.MediaPath).toBe("/tmp/1.png");
expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["/tmp/1.png"]);
expect(mockState.lastDispatchCtx?.MediaType).toBe("image/png");
expect(mockState.lastDispatchCtx?.MediaTypes).toEqual(["image/png"]);
expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
expect(mockState.savedMediaCalls).toEqual([
expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
]);

View File

@@ -794,7 +794,24 @@ function buildChatSendTranscriptMessage(params: {
};
}
// Stages non-image offloads into the agent sandbox synchronously so chat.send
function stripTrailingOffloadedMediaMarkers(message: string, refs: OffloadedRef[]): string {
if (refs.length === 0) {
return message;
}
const removableRefs = new Set(refs.map((ref) => ref.mediaRef));
const lines = message.split(/\r?\n/);
while (lines.length > 0) {
const last = lines[lines.length - 1]?.trim() ?? "";
const match = /^\[media attached:\s*(media:\/\/inbound\/[^\]\s]+)\]$/.exec(last);
if (!match?.[1] || !removableRefs.delete(match[1])) {
break;
}
lines.pop();
}
return lines.join("\n").trimEnd();
}
// Stages media-path offloads into the agent sandbox synchronously so chat.send
// can surface 5xx before respond(). Throws MediaOffloadError on any staging
// failure (ENOSPC / EPERM / partial-stage) so the outer chat.send handler can
// map it to UNAVAILABLE (5xx); plain Error would be misclassified as 4xx. All
@@ -802,20 +819,20 @@ function buildChatSendTranscriptMessage(params: {
// Callers MUST set ctx.MediaStaged=true when this runs so the dispatch
// pipeline skips its own stageSandboxMedia pass.
//
// Returned paths are ABSOLUTE (pointing into the sandbox workspace when sandbox
// is enabled, or the media-store origin when it is not). applyMediaUnderstanding
// runs before any further staging in get-reply.ts and uses
// `path.isAbsolute(raw) ? raw : path.resolve(raw)` against the gateway CWD, so
// any relative path here would make media-understanding target the wrong host
// path and silently skip file analysis.
async function prestageNonImageOffloads(params: {
// Returned paths are absolute media-store paths when no sandbox is active, or
// sandbox-relative paths plus `workspaceDir` when sandboxing is active. Host-side
// media-understanding uses MediaWorkspaceDir to resolve those relative paths.
async function prestageMediaPathOffloads(params: {
offloadedRefs: OffloadedRef[];
includeImageRefs?: boolean;
cfg: OpenClawConfig;
sessionKey: string;
agentId: string;
}): Promise<{ paths: string[]; types: string[]; workspaceDir?: string }> {
const nonImage = params.offloadedRefs.filter((ref) => !ref.mimeType.startsWith("image/"));
if (nonImage.length === 0) {
const mediaPathRefs = params.offloadedRefs.filter(
(ref) => params.includeImageRefs || !ref.mimeType.startsWith("image/"),
);
if (mediaPathRefs.length === 0) {
return { paths: [], types: [] };
}
@@ -828,33 +845,33 @@ async function prestageNonImageOffloads(params: {
});
if (!sandbox) {
return {
paths: nonImage.map((ref) => ref.path),
types: nonImage.map((ref) => ref.mimeType),
paths: mediaPathRefs.map((ref) => ref.path),
types: mediaPathRefs.map((ref) => ref.mimeType),
};
}
// stageSandboxMedia caps each file at STAGED_MEDIA_MAX_BYTES (=
// MEDIA_MAX_BYTES, 5MB) and silently skips oversized files. The parse cap
// (resolveChatAttachmentMaxBytes, default 20MB) is higher, so a sandboxed
// session receiving a non-image file between the two caps would otherwise
// session receiving a file between the two caps would otherwise
// pass parse, fail staging, and surface as a retryable 5xx even though
// retry cannot succeed. Reject here as a client-side 4xx instead.
const oversizedForSandbox = nonImage.filter((ref) => ref.sizeBytes > MEDIA_MAX_BYTES);
const oversizedForSandbox = mediaPathRefs.filter((ref) => ref.sizeBytes > MEDIA_MAX_BYTES);
if (oversizedForSandbox.length > 0) {
const details = oversizedForSandbox
.map((ref) => `${ref.label} (${ref.sizeBytes} bytes)`)
.join(", ");
throw new UnsupportedAttachmentError(
"non-image-too-large-for-sandbox",
`non-image attachments exceed sandbox staging limit (${MEDIA_MAX_BYTES} bytes): ${details}`,
`attachments exceed sandbox staging limit (${MEDIA_MAX_BYTES} bytes): ${details}`,
);
}
const stagingCtx: MsgContext = {
MediaPath: nonImage[0].path,
MediaPaths: nonImage.map((ref) => ref.path),
MediaType: nonImage[0].mimeType,
MediaTypes: nonImage.map((ref) => ref.mimeType),
MediaPath: mediaPathRefs[0].path,
MediaPaths: mediaPathRefs.map((ref) => ref.path),
MediaType: mediaPathRefs[0].mimeType,
MediaTypes: mediaPathRefs.map((ref) => ref.mimeType),
};
const stageResult = await stageSandboxMedia({
ctx: stagingCtx,
@@ -871,14 +888,14 @@ async function prestageNonImageOffloads(params: {
// (STAGED_MEDIA_MAX_BYTES = 5MB); check the returned `staged` map so any
// missing source becomes a 5xx MediaOffloadError the client can retry.
const stagedSources = stageResult.staged;
const missing = nonImage.filter((ref) => !stagedSources.has(ref.path));
const missing = mediaPathRefs.filter((ref) => !stagedSources.has(ref.path));
if (missing.length > 0) {
throw new Error(
`non-image attachment staging incomplete: ${stagedSources.size}/${nonImage.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`,
`attachment staging incomplete: ${stagedSources.size}/${mediaPathRefs.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`,
);
}
const stagedPaths = stagingCtx.MediaPaths ?? [];
const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);
const stagedTypes = stagingCtx.MediaTypes ?? mediaPathRefs.map((ref) => ref.mimeType);
// Keep stagedPaths sandbox-relative (e.g. `media/inbound/foo.pdf`) so the
// agent inside the container can read them. Host-side media-understanding
@@ -897,7 +914,7 @@ async function prestageNonImageOffloads(params: {
throw err;
}
throw new MediaOffloadError(
`[Gateway Error] Failed to stage non-image attachments into agent workspace: ${formatErrorMessage(err)}`,
`[Gateway Error] Failed to stage attachments into agent workspace: ${formatErrorMessage(err)}`,
{ cause: err },
);
}
@@ -1896,9 +1913,9 @@ export const chatHandlers: GatewayRequestHandlers = {
let parsedImages: ChatImageContent[] = [];
let imageOrder: PromptImageOrderEntry[] = [];
let offloadedRefs: OffloadedRef[] = [];
let nonImageMediaPaths: string[] = [];
let nonImageMediaTypes: string[] = [];
let nonImageMediaWorkspaceDir: string | undefined;
let mediaPathOffloadPaths: string[] = [];
let mediaPathOffloadTypes: string[] = [];
let mediaPathOffloadWorkspaceDir: string | undefined;
const timeoutMs = resolveAgentTimeoutMs({
cfg,
overrideMs: p.timeoutMs,
@@ -1971,25 +1988,35 @@ export const chatHandlers: GatewayRequestHandlers = {
supportsSessionModelImages ||
explicitOriginTargetsAcpSession(explicitOriginResult.value) ||
explicitOriginTargetsPlugin;
const routeImageOffloadsAsMediaPaths = !supportsImages;
try {
const parsed = await parseMessageWithAttachments(inboundMessage, normalizedAttachments, {
maxBytes: resolveChatAttachmentMaxBytes(cfg),
log: context.logGateway,
supportsImages,
// chat.send routes non-image offloadedRefs into ctx.MediaPaths below
// chat.send routes selected offloadedRefs into ctx.MediaPaths below
// so the auto-reply stage pipeline can surface them to the agent.
acceptNonImage: true,
});
parsedMessage = parsed.message;
parsedMessage = stripTrailingOffloadedMediaMarkers(
parsed.message,
routeImageOffloadsAsMediaPaths
? parsed.offloadedRefs.filter((ref) => ref.mimeType.startsWith("image/"))
: [],
);
parsedImages = parsed.images;
imageOrder = parsed.imageOrder;
imageOrder = routeImageOffloadsAsMediaPaths ? [] : parsed.imageOrder;
offloadedRefs = parsed.offloadedRefs;
({
paths: nonImageMediaPaths,
types: nonImageMediaTypes,
workspaceDir: nonImageMediaWorkspaceDir,
} = await prestageNonImageOffloads({
paths: mediaPathOffloadPaths,
types: mediaPathOffloadTypes,
workspaceDir: mediaPathOffloadWorkspaceDir,
} = await prestageMediaPathOffloads({
offloadedRefs,
// Text-only image offloads need ctx.MediaPaths so media-understanding
// can describe them via agents.defaults.imageModel. Vision-capable
// image offloads stay as prompt refs for native image loading.
includeImageRefs: routeImageOffloadsAsMediaPaths,
cfg,
sessionKey,
agentId,
@@ -2100,17 +2127,17 @@ export const chatHandlers: GatewayRequestHandlers = {
GatewayClientScopes: client?.connect?.scopes ?? [],
...pluginBoundMediaFields,
};
if (nonImageMediaPaths.length > 0) {
// Inject non-image offloads via the same MsgContext fields the channel
if (mediaPathOffloadPaths.length > 0) {
// Inject offloads via the same MsgContext fields the channel
// path uses so buildInboundMediaNote renders a real `[media attached:
// <workspace-relative-path>]` line into the agent prompt. Marker
// blocks the dispatch pipeline from re-running stageSandboxMedia; see
// prestageNonImageOffloads.
ctx.MediaPath = nonImageMediaPaths[0];
ctx.MediaPaths = nonImageMediaPaths;
ctx.MediaType = nonImageMediaTypes[0];
ctx.MediaTypes = nonImageMediaTypes;
ctx.MediaWorkspaceDir = nonImageMediaWorkspaceDir;
// prestageMediaPathOffloads.
ctx.MediaPath = mediaPathOffloadPaths[0];
ctx.MediaPaths = mediaPathOffloadPaths;
ctx.MediaType = mediaPathOffloadTypes[0];
ctx.MediaTypes = mediaPathOffloadTypes;
ctx.MediaWorkspaceDir = mediaPathOffloadWorkspaceDir;
ctx.MediaStaged = true;
}