mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 08:40:44 +00:00
fix(gateway): route text-only chat images to media understanding
This commit is contained in:
@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
|
||||
### Fixes
|
||||
|
||||
- Cron/Telegram: preserve explicit `:topic:` delivery targets over stale session-derived thread IDs when isolated cron announces to Telegram forum topics. Carries forward #59069; refs #49704 and #43808. Thanks @roytong9.
|
||||
- Gateway/media: route text-only `chat.send` image offloads through media-understanding fields so `agents.defaults.imageModel` can describe WebChat attachments instead of leaving only an opaque `media://inbound` marker. Fixes #72968. Thanks @vorajeeah.
|
||||
- CLI/onboarding: infer image input for common custom-provider vision model IDs, ask only for unknown models, and keep `--custom-image-input`/`--custom-text-input` overrides so vision-capable proxies do not get saved as text-only configs. Fixes #51869. Thanks @Antsoldier1974.
|
||||
- Models/OpenAI Codex: stop listing or resolving unsupported `openai-codex/gpt-5.4-mini` rows through Codex OAuth, keep stale discovery rows suppressed with a clear API-key-route hint, and leave direct `openai/gpt-5.4-mini` available. Fixes #73242. Thanks @0xCyda.
|
||||
- Memory/Dreaming: retry Dream Diary once with the session default when a configured dreaming model is unavailable, while leaving subagent trust and allowlist errors visible instead of silently masking configuration problems. Refs #67409 and #69209. Thanks @Ghiggins18 and @everySympathy.
|
||||
|
||||
@@ -2357,7 +2357,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
expect(JSON.stringify(transcriptUpdate)).not.toContain("[[audio_as_voice]]");
|
||||
});
|
||||
|
||||
it("offloads image attachments for text-only session models", async () => {
|
||||
it("routes text-only image offloads into media-understanding fields", async () => {
|
||||
createTranscriptFixture("openclaw-chat-send-text-only-attachments-");
|
||||
mockState.finalText = "ok";
|
||||
mockState.sessionEntry = {
|
||||
@@ -2394,10 +2394,14 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchImages).toBeUndefined();
|
||||
expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
|
||||
expect(mockState.lastDispatchCtx?.Body).toMatch(
|
||||
/^describe image\n\[media attached: media:\/\/inbound\//,
|
||||
);
|
||||
expect(mockState.lastDispatchImageOrder).toBeUndefined();
|
||||
expect(mockState.lastDispatchCtx?.Body).toBe("describe image");
|
||||
expect(mockState.lastDispatchCtx?.Body).not.toContain("media://");
|
||||
expect(mockState.lastDispatchCtx?.MediaPath).toBe("/tmp/1.png");
|
||||
expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["/tmp/1.png"]);
|
||||
expect(mockState.lastDispatchCtx?.MediaType).toBe("image/png");
|
||||
expect(mockState.lastDispatchCtx?.MediaTypes).toEqual(["image/png"]);
|
||||
expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
|
||||
expect(mockState.savedMediaCalls).toEqual([
|
||||
expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
|
||||
]);
|
||||
@@ -2557,10 +2561,14 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
});
|
||||
|
||||
expect(mockState.lastDispatchImages).toBeUndefined();
|
||||
expect(mockState.lastDispatchImageOrder).toEqual(["offloaded"]);
|
||||
expect(mockState.lastDispatchCtx?.Body).toMatch(
|
||||
/^describe image\n\[media attached: media:\/\/inbound\//,
|
||||
);
|
||||
expect(mockState.lastDispatchImageOrder).toBeUndefined();
|
||||
expect(mockState.lastDispatchCtx?.Body).toBe("describe image");
|
||||
expect(mockState.lastDispatchCtx?.Body).not.toContain("media://");
|
||||
expect(mockState.lastDispatchCtx?.MediaPath).toBe("/tmp/1.png");
|
||||
expect(mockState.lastDispatchCtx?.MediaPaths).toEqual(["/tmp/1.png"]);
|
||||
expect(mockState.lastDispatchCtx?.MediaType).toBe("image/png");
|
||||
expect(mockState.lastDispatchCtx?.MediaTypes).toEqual(["image/png"]);
|
||||
expect(mockState.lastDispatchCtx?.MediaStaged).toBe(true);
|
||||
expect(mockState.savedMediaCalls).toEqual([
|
||||
expect.objectContaining({ contentType: "image/png", subdir: "inbound" }),
|
||||
]);
|
||||
|
||||
@@ -794,7 +794,24 @@ function buildChatSendTranscriptMessage(params: {
|
||||
};
|
||||
}
|
||||
|
||||
// Stages non-image offloads into the agent sandbox synchronously so chat.send
|
||||
/**
 * Removes trailing `[media attached: media://inbound/...]` marker lines from
 * `message` when they correspond to one of the given offloaded refs.
 *
 * Lines are examined from the end of the message backwards. Stripping stops at
 * the first line that is not a marker, or whose media ref is not (or is no
 * longer) in `refs`; each ref removes at most one marker line.
 *
 * @param message - Message text that may end with media marker lines.
 * @param refs - Offloaded refs whose `mediaRef` markers may be removed.
 * @returns The message without the matched trailing markers (remaining lines
 *   joined with `\n` and trailing whitespace trimmed), or the original message
 *   unchanged when nothing was stripped.
 */
function stripTrailingOffloadedMediaMarkers(message: string, refs: OffloadedRef[]): string {
  if (refs.length === 0) {
    return message;
  }
  // Hoisted so the pattern is not rebuilt on every loop iteration.
  const markerPattern = /^\[media attached:\s*(media:\/\/inbound\/[^\]\s]+)\]$/;
  const removableRefs = new Set(refs.map((ref) => ref.mediaRef));
  const lines = message.split(/\r?\n/);
  let removedCount = 0;
  while (lines.length > 0) {
    const last = lines[lines.length - 1]?.trim() ?? "";
    const mediaRef = markerPattern.exec(last)?.[1];
    // Set.delete returns false for unknown or already-consumed refs, so a
    // duplicate marker line for the same ref is left in place.
    if (!mediaRef || !removableRefs.delete(mediaRef)) {
      break;
    }
    lines.pop();
    removedCount += 1;
  }
  if (removedCount === 0) {
    // Nothing was stripped: return the caller's text untouched (same as the
    // empty-refs path) instead of normalizing line endings and whitespace.
    return message;
  }
  return lines.join("\n").trimEnd();
}
|
||||
|
||||
// Stages media-path offloads into the agent sandbox synchronously so chat.send
|
||||
// can surface 5xx before respond(). Throws MediaOffloadError on any staging
|
||||
// failure (ENOSPC / EPERM / partial-stage) so the outer chat.send handler can
|
||||
// map it to UNAVAILABLE (5xx); plain Error would be misclassified as 4xx. All
|
||||
@@ -802,20 +819,20 @@ function buildChatSendTranscriptMessage(params: {
|
||||
// Callers MUST set ctx.MediaStaged=true when this runs so the dispatch
|
||||
// pipeline skips its own stageSandboxMedia pass.
|
||||
//
|
||||
// Returned paths are ABSOLUTE (pointing into the sandbox workspace when sandbox
|
||||
// is enabled, or the media-store origin when it is not). applyMediaUnderstanding
|
||||
// runs before any further staging in get-reply.ts and uses
|
||||
// `path.isAbsolute(raw) ? raw : path.resolve(raw)` against the gateway CWD, so
|
||||
// any relative path here would make media-understanding target the wrong host
|
||||
// path and silently skip file analysis.
|
||||
async function prestageNonImageOffloads(params: {
|
||||
// Returned paths are absolute media-store paths when no sandbox is active, or
|
||||
// sandbox-relative paths plus `workspaceDir` when sandboxing is active. Host-side
|
||||
// media-understanding uses MediaWorkspaceDir to resolve those relative paths.
|
||||
async function prestageMediaPathOffloads(params: {
|
||||
offloadedRefs: OffloadedRef[];
|
||||
includeImageRefs?: boolean;
|
||||
cfg: OpenClawConfig;
|
||||
sessionKey: string;
|
||||
agentId: string;
|
||||
}): Promise<{ paths: string[]; types: string[]; workspaceDir?: string }> {
|
||||
const nonImage = params.offloadedRefs.filter((ref) => !ref.mimeType.startsWith("image/"));
|
||||
if (nonImage.length === 0) {
|
||||
const mediaPathRefs = params.offloadedRefs.filter(
|
||||
(ref) => params.includeImageRefs || !ref.mimeType.startsWith("image/"),
|
||||
);
|
||||
if (mediaPathRefs.length === 0) {
|
||||
return { paths: [], types: [] };
|
||||
}
|
||||
|
||||
@@ -828,33 +845,33 @@ async function prestageNonImageOffloads(params: {
|
||||
});
|
||||
if (!sandbox) {
|
||||
return {
|
||||
paths: nonImage.map((ref) => ref.path),
|
||||
types: nonImage.map((ref) => ref.mimeType),
|
||||
paths: mediaPathRefs.map((ref) => ref.path),
|
||||
types: mediaPathRefs.map((ref) => ref.mimeType),
|
||||
};
|
||||
}
|
||||
|
||||
// stageSandboxMedia caps each file at STAGED_MEDIA_MAX_BYTES (=
|
||||
// MEDIA_MAX_BYTES, 5MB) and silently skips oversized files. The parse cap
|
||||
// (resolveChatAttachmentMaxBytes, default 20MB) is higher, so a sandboxed
|
||||
// session receiving a non-image file between the two caps would otherwise
|
||||
// session receiving a file between the two caps would otherwise
|
||||
// pass parse, fail staging, and surface as a retryable 5xx even though
|
||||
// retry cannot succeed. Reject here as a client-side 4xx instead.
|
||||
const oversizedForSandbox = nonImage.filter((ref) => ref.sizeBytes > MEDIA_MAX_BYTES);
|
||||
const oversizedForSandbox = mediaPathRefs.filter((ref) => ref.sizeBytes > MEDIA_MAX_BYTES);
|
||||
if (oversizedForSandbox.length > 0) {
|
||||
const details = oversizedForSandbox
|
||||
.map((ref) => `${ref.label} (${ref.sizeBytes} bytes)`)
|
||||
.join(", ");
|
||||
throw new UnsupportedAttachmentError(
|
||||
"non-image-too-large-for-sandbox",
|
||||
`non-image attachments exceed sandbox staging limit (${MEDIA_MAX_BYTES} bytes): ${details}`,
|
||||
`attachments exceed sandbox staging limit (${MEDIA_MAX_BYTES} bytes): ${details}`,
|
||||
);
|
||||
}
|
||||
|
||||
const stagingCtx: MsgContext = {
|
||||
MediaPath: nonImage[0].path,
|
||||
MediaPaths: nonImage.map((ref) => ref.path),
|
||||
MediaType: nonImage[0].mimeType,
|
||||
MediaTypes: nonImage.map((ref) => ref.mimeType),
|
||||
MediaPath: mediaPathRefs[0].path,
|
||||
MediaPaths: mediaPathRefs.map((ref) => ref.path),
|
||||
MediaType: mediaPathRefs[0].mimeType,
|
||||
MediaTypes: mediaPathRefs.map((ref) => ref.mimeType),
|
||||
};
|
||||
const stageResult = await stageSandboxMedia({
|
||||
ctx: stagingCtx,
|
||||
@@ -871,14 +888,14 @@ async function prestageNonImageOffloads(params: {
|
||||
// (STAGED_MEDIA_MAX_BYTES = 5MB); check the returned `staged` map so any
|
||||
// missing source becomes a 5xx MediaOffloadError the client can retry.
|
||||
const stagedSources = stageResult.staged;
|
||||
const missing = nonImage.filter((ref) => !stagedSources.has(ref.path));
|
||||
const missing = mediaPathRefs.filter((ref) => !stagedSources.has(ref.path));
|
||||
if (missing.length > 0) {
|
||||
throw new Error(
|
||||
`non-image attachment staging incomplete: ${stagedSources.size}/${nonImage.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`,
|
||||
`attachment staging incomplete: ${stagedSources.size}/${mediaPathRefs.length} paths staged into sandbox workspace (missing: ${missing.map((ref) => ref.path).join(", ")})`,
|
||||
);
|
||||
}
|
||||
const stagedPaths = stagingCtx.MediaPaths ?? [];
|
||||
const stagedTypes = stagingCtx.MediaTypes ?? nonImage.map((ref) => ref.mimeType);
|
||||
const stagedTypes = stagingCtx.MediaTypes ?? mediaPathRefs.map((ref) => ref.mimeType);
|
||||
|
||||
// Keep stagedPaths sandbox-relative (e.g. `media/inbound/foo.pdf`) so the
|
||||
// agent inside the container can read them. Host-side media-understanding
|
||||
@@ -897,7 +914,7 @@ async function prestageNonImageOffloads(params: {
|
||||
throw err;
|
||||
}
|
||||
throw new MediaOffloadError(
|
||||
`[Gateway Error] Failed to stage non-image attachments into agent workspace: ${formatErrorMessage(err)}`,
|
||||
`[Gateway Error] Failed to stage attachments into agent workspace: ${formatErrorMessage(err)}`,
|
||||
{ cause: err },
|
||||
);
|
||||
}
|
||||
@@ -1896,9 +1913,9 @@ export const chatHandlers: GatewayRequestHandlers = {
|
||||
let parsedImages: ChatImageContent[] = [];
|
||||
let imageOrder: PromptImageOrderEntry[] = [];
|
||||
let offloadedRefs: OffloadedRef[] = [];
|
||||
let nonImageMediaPaths: string[] = [];
|
||||
let nonImageMediaTypes: string[] = [];
|
||||
let nonImageMediaWorkspaceDir: string | undefined;
|
||||
let mediaPathOffloadPaths: string[] = [];
|
||||
let mediaPathOffloadTypes: string[] = [];
|
||||
let mediaPathOffloadWorkspaceDir: string | undefined;
|
||||
const timeoutMs = resolveAgentTimeoutMs({
|
||||
cfg,
|
||||
overrideMs: p.timeoutMs,
|
||||
@@ -1971,25 +1988,35 @@ export const chatHandlers: GatewayRequestHandlers = {
|
||||
supportsSessionModelImages ||
|
||||
explicitOriginTargetsAcpSession(explicitOriginResult.value) ||
|
||||
explicitOriginTargetsPlugin;
|
||||
const routeImageOffloadsAsMediaPaths = !supportsImages;
|
||||
try {
|
||||
const parsed = await parseMessageWithAttachments(inboundMessage, normalizedAttachments, {
|
||||
maxBytes: resolveChatAttachmentMaxBytes(cfg),
|
||||
log: context.logGateway,
|
||||
supportsImages,
|
||||
// chat.send routes non-image offloadedRefs into ctx.MediaPaths below
|
||||
// chat.send routes selected offloadedRefs into ctx.MediaPaths below
|
||||
// so the auto-reply stage pipeline can surface them to the agent.
|
||||
acceptNonImage: true,
|
||||
});
|
||||
parsedMessage = parsed.message;
|
||||
parsedMessage = stripTrailingOffloadedMediaMarkers(
|
||||
parsed.message,
|
||||
routeImageOffloadsAsMediaPaths
|
||||
? parsed.offloadedRefs.filter((ref) => ref.mimeType.startsWith("image/"))
|
||||
: [],
|
||||
);
|
||||
parsedImages = parsed.images;
|
||||
imageOrder = parsed.imageOrder;
|
||||
imageOrder = routeImageOffloadsAsMediaPaths ? [] : parsed.imageOrder;
|
||||
offloadedRefs = parsed.offloadedRefs;
|
||||
({
|
||||
paths: nonImageMediaPaths,
|
||||
types: nonImageMediaTypes,
|
||||
workspaceDir: nonImageMediaWorkspaceDir,
|
||||
} = await prestageNonImageOffloads({
|
||||
paths: mediaPathOffloadPaths,
|
||||
types: mediaPathOffloadTypes,
|
||||
workspaceDir: mediaPathOffloadWorkspaceDir,
|
||||
} = await prestageMediaPathOffloads({
|
||||
offloadedRefs,
|
||||
// Text-only image offloads need ctx.MediaPaths so media-understanding
|
||||
// can describe them via agents.defaults.imageModel. Vision-capable
|
||||
// image offloads stay as prompt refs for native image loading.
|
||||
includeImageRefs: routeImageOffloadsAsMediaPaths,
|
||||
cfg,
|
||||
sessionKey,
|
||||
agentId,
|
||||
@@ -2100,17 +2127,17 @@ export const chatHandlers: GatewayRequestHandlers = {
|
||||
GatewayClientScopes: client?.connect?.scopes ?? [],
|
||||
...pluginBoundMediaFields,
|
||||
};
|
||||
if (nonImageMediaPaths.length > 0) {
|
||||
// Inject non-image offloads via the same MsgContext fields the channel
|
||||
if (mediaPathOffloadPaths.length > 0) {
|
||||
// Inject offloads via the same MsgContext fields the channel
|
||||
// path uses so buildInboundMediaNote renders a real `[media attached:
|
||||
// <workspace-relative-path>]` line into the agent prompt. The MediaStaged marker
|
||||
// blocks the dispatch pipeline from re-running stageSandboxMedia; see
|
||||
// prestageNonImageOffloads.
|
||||
ctx.MediaPath = nonImageMediaPaths[0];
|
||||
ctx.MediaPaths = nonImageMediaPaths;
|
||||
ctx.MediaType = nonImageMediaTypes[0];
|
||||
ctx.MediaTypes = nonImageMediaTypes;
|
||||
ctx.MediaWorkspaceDir = nonImageMediaWorkspaceDir;
|
||||
// prestageMediaPathOffloads.
|
||||
ctx.MediaPath = mediaPathOffloadPaths[0];
|
||||
ctx.MediaPaths = mediaPathOffloadPaths;
|
||||
ctx.MediaType = mediaPathOffloadTypes[0];
|
||||
ctx.MediaTypes = mediaPathOffloadTypes;
|
||||
ctx.MediaWorkspaceDir = mediaPathOffloadWorkspaceDir;
|
||||
ctx.MediaStaged = true;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user