From 178a314a4c1cfa98ac0e8545707162427e38f9c5 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Fri, 24 Apr 2026 00:42:23 +0100
Subject: [PATCH] fix: restore OpenRouter vision prompts

---
 CHANGELOG.md                          |  1 +
 src/media-understanding/image.test.ts | 47 +++++++++++++++++++++++++++
 src/media-understanding/image.ts      | 36 ++++++++++++++++----
 3 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f07739a3a64..fba67de5fe7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
 
 - Agents/transport: stop embedded runs from lowering the process-wide undici stream timeouts, so slow Gemini image generation and other long-running provider requests no longer inherit short run-attempt headers timeouts. Fixes #70423. Thanks @giangthb.
+- Providers/OpenRouter: send image-understanding prompts as user text before image parts, restoring non-empty vision responses for OpenRouter multimodal models. Fixes #70410.
 - Memory/QMD: recreate stale managed QMD collections when startup repair finds the collection name already exists, so root memory narrows back to `MEMORY.md` instead of staying on broad workspace markdown indexing.
 - Agents/OpenAI: surface selected-model capacity failures from PI, Codex, and auto-reply harness paths with a model-switch hint instead of the generic empty-response error. Thanks @vincentkoc.
 - Providers/OpenAI: route `openai/gpt-image-2` through configured Codex OAuth directly when an `openai-codex` profile is active, instead of probing `OPENAI_API_KEY` first.
diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts
index 7f193730e9f..5cfd61090be 100644
--- a/src/media-understanding/image.test.ts
+++ b/src/media-understanding/image.test.ts
@@ -249,6 +249,53 @@ describe("describeImageWithModel", () => {
     expect(context?.messages?.[0]?.content).toHaveLength(1);
   });
 
+  it("places OpenRouter image prompts in user content before images", async () => {
+    discoverModelsMock.mockReturnValue({
+      find: vi.fn(() => ({
+        api: "openai-completions",
+        provider: "openrouter",
+        id: "google/gemini-2.5-flash",
+        input: ["text", "image"],
+        baseUrl: "https://openrouter.ai/api/v1",
+      })),
+    });
+    completeMock.mockResolvedValue({
+      role: "assistant",
+      api: "openai-completions",
+      provider: "openrouter",
+      model: "google/gemini-2.5-flash",
+      stopReason: "stop",
+      timestamp: Date.now(),
+      content: [{ type: "text", text: "openrouter ok" }],
+    });
+
+    const result = await describeImageWithModel({
+      cfg: {},
+      agentDir: "/tmp/openclaw-agent",
+      provider: "openrouter",
+      model: "google/gemini-2.5-flash",
+      buffer: Buffer.from("png-bytes"),
+      fileName: "image.png",
+      mime: "image/png",
+      prompt: "Describe the image.",
+      timeoutMs: 1000,
+    });
+
+    expect(result).toEqual({
+      text: "openrouter ok",
+      model: "google/gemini-2.5-flash",
+    });
+    const [, context] = completeMock.mock.calls[0] ?? [];
+    expect(context?.systemPrompt).toBeUndefined();
+    expect(context?.messages?.[0]?.content).toEqual([
+      { type: "text", text: "Describe the image." },
+      expect.objectContaining({
+        type: "image",
+        mimeType: "image/png",
+      }),
+    ]);
+  });
+
   it.each([
     {
       name: "direct OpenAI Responses baseUrl",
diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts
index 1ee3ec5bdfc..83073871b16 100644
--- a/src/media-understanding/image.ts
+++ b/src/media-understanding/image.ts
@@ -132,23 +132,43 @@ async function resolveImageRuntime(params: {
 function buildImageContext(
   prompt: string,
   images: Array<{ buffer: Buffer; mime?: string }>,
+  opts?: { promptInUserContent?: boolean },
 ): Context {
+  const imageContent = images.map((image) => ({
+    type: "image" as const,
+    data: image.buffer.toString("base64"),
+    mimeType: image.mime ?? "image/jpeg",
+  }));
+  const content = opts?.promptInUserContent
+    ? [{ type: "text" as const, text: prompt }, ...imageContent]
+    : imageContent;
+
   return {
-    systemPrompt: prompt,
+    ...(opts?.promptInUserContent ? {} : { systemPrompt: prompt }),
     messages: [
       {
         role: "user",
-        content: images.map((image) => ({
-          type: "image" as const,
-          data: image.buffer.toString("base64"),
-          mimeType: image.mime ?? "image/jpeg",
-        })),
+        content,
         timestamp: Date.now(),
       },
     ],
   };
 }
 
+function shouldPlaceImagePromptInUserContent(model: Model): boolean {
+  const capabilities = resolveProviderRequestCapabilities({
+    provider: model.provider,
+    api: model.api,
+    baseUrl: model.baseUrl,
+    capability: "image",
+    transport: "media-understanding",
+  });
+  return (
+    capabilities.endpointClass === "openrouter" ||
+    (model.provider.toLowerCase() === "openrouter" && capabilities.endpointClass === "default")
+  );
+}
+
 async function describeImagesWithMinimax(params: {
   apiKey: string;
   modelId: string;
@@ -252,7 +272,9 @@ export async function describeImagesWithModel(
     agentDir: params.agentDir,
   });
 
-  const context = buildImageContext(prompt, params.images);
+  const context = buildImageContext(prompt, params.images, {
+    promptInUserContent: shouldPlaceImagePromptInUserContent(model),
+  });
 
   const controller = new AbortController();
   const timeout = typeof params.timeoutMs === "number" &&
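
For reviewers, a minimal sketch of the two Context shapes buildImageContext
can now emit. The inlined types below are illustrative stand-ins for the real
Context type (which lives elsewhere in the codebase and may carry more
fields), and the base64 payload is a placeholder:

// Illustrative stand-in types; the real Context type may carry more fields.
type TextPart = { type: "text"; text: string };
type ImagePart = { type: "image"; data: string; mimeType: string };

interface SketchContext {
  systemPrompt?: string;
  messages: Array<{
    role: "user";
    content: Array<TextPart | ImagePart>;
    timestamp: number;
  }>;
}

// Default providers: the prompt rides in systemPrompt and the user message
// carries only image parts.
const defaultShape: SketchContext = {
  systemPrompt: "Describe the image.",
  messages: [
    {
      role: "user",
      content: [{ type: "image", data: "<base64>", mimeType: "image/png" }],
      timestamp: Date.now(),
    },
  ],
};

// OpenRouter: no systemPrompt; the prompt becomes the first user text part,
// placed ahead of the image parts, which is what restores non-empty vision
// responses from OpenRouter multimodal models.
const openRouterShape: SketchContext = {
  messages: [
    {
      role: "user",
      content: [
        { type: "text", text: "Describe the image." },
        { type: "image", data: "<base64>", mimeType: "image/png" },
      ],
      timestamp: Date.now(),
    },
  ],
};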