From 178a314a4c1cfa98ac0e8545707162427e38f9c5 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Fri, 24 Apr 2026 00:42:23 +0100
Subject: [PATCH] fix: restore OpenRouter vision prompts

---
 CHANGELOG.md                          |  1 +
 src/media-understanding/image.test.ts | 47 +++++++++++++++++++++++++++
 src/media-understanding/image.ts      | 36 ++++++++++++++++----
 3 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f07739a3a64..fba67de5fe7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
 
 - Agents/transport: stop embedded runs from lowering the process-wide undici stream timeouts, so slow Gemini image generation and other long-running provider requests no longer inherit short run-attempt headers timeouts. Fixes #70423. Thanks @giangthb.
+- Providers/OpenRouter: send image-understanding prompts as user text before image parts, restoring non-empty vision responses for OpenRouter multimodal models. Fixes #70410.
 - Memory/QMD: recreate stale managed QMD collections when startup repair finds the collection name already exists, so root memory narrows back to `MEMORY.md` instead of staying on broad workspace markdown indexing.
 - Agents/OpenAI: surface selected-model capacity failures from PI, Codex, and auto-reply harness paths with a model-switch hint instead of the generic empty-response error. Thanks @vincentkoc.
 - Providers/OpenAI: route `openai/gpt-image-2` through configured Codex OAuth directly when an `openai-codex` profile is active, instead of probing `OPENAI_API_KEY` first.
diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts
index 7f193730e9f..5cfd61090be 100644
--- a/src/media-understanding/image.test.ts
+++ b/src/media-understanding/image.test.ts
@@ -249,6 +249,53 @@ describe("describeImageWithModel", () => {
     expect(context?.messages?.[0]?.content).toHaveLength(1);
   });
 
+  it("places OpenRouter image prompts in user content before images", async () => {
+    discoverModelsMock.mockReturnValue({
+      find: vi.fn(() => ({
+        api: "openai-completions",
+        provider: "openrouter",
+        id: "google/gemini-2.5-flash",
+        input: ["text", "image"],
+        baseUrl: "https://openrouter.ai/api/v1",
+      })),
+    });
+    completeMock.mockResolvedValue({
+      role: "assistant",
+      api: "openai-completions",
+      provider: "openrouter",
+      model: "google/gemini-2.5-flash",
+      stopReason: "stop",
+      timestamp: Date.now(),
+      content: [{ type: "text", text: "openrouter ok" }],
+    });
+
+    const result = await describeImageWithModel({
+      cfg: {},
+      agentDir: "/tmp/openclaw-agent",
+      provider: "openrouter",
+      model: "google/gemini-2.5-flash",
+      buffer: Buffer.from("png-bytes"),
+      fileName: "image.png",
+      mime: "image/png",
+      prompt: "Describe the image.",
+      timeoutMs: 1000,
+    });
+
+    expect(result).toEqual({
+      text: "openrouter ok",
+      model: "google/gemini-2.5-flash",
+    });
+    const [, context] = completeMock.mock.calls[0] ?? [];
+    expect(context?.systemPrompt).toBeUndefined();
+    expect(context?.messages?.[0]?.content).toEqual([
+      { type: "text", text: "Describe the image." },
+      expect.objectContaining({
+        type: "image",
+        mimeType: "image/png",
+      }),
+    ]);
+  });
+
   it.each([
     {
       name: "direct OpenAI Responses baseUrl",
diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts
index 1ee3ec5bdfc..83073871b16 100644
--- a/src/media-understanding/image.ts
+++ b/src/media-understanding/image.ts
@@ -132,23 +132,43 @@ async function resolveImageRuntime(params: {
 function buildImageContext(
   prompt: string,
   images: Array<{ buffer: Buffer; mime?: string }>,
+  opts?: { promptInUserContent?: boolean },
 ): Context {
+  const imageContent = images.map((image) => ({
+    type: "image" as const,
+    data: image.buffer.toString("base64"),
+    mimeType: image.mime ?? "image/jpeg",
+  }));
+  const content = opts?.promptInUserContent
+    ? [{ type: "text" as const, text: prompt }, ...imageContent]
+    : imageContent;
+
   return {
-    systemPrompt: prompt,
+    ...(opts?.promptInUserContent ? {} : { systemPrompt: prompt }),
     messages: [
       {
         role: "user",
-        content: images.map((image) => ({
-          type: "image" as const,
-          data: image.buffer.toString("base64"),
-          mimeType: image.mime ?? "image/jpeg",
-        })),
+        content,
         timestamp: Date.now(),
       },
     ],
   };
 }
 
+function shouldPlaceImagePromptInUserContent(model: Model): boolean {
+  const capabilities = resolveProviderRequestCapabilities({
+    provider: model.provider,
+    api: model.api,
+    baseUrl: model.baseUrl,
+    capability: "image",
+    transport: "media-understanding",
+  });
+  return (
+    capabilities.endpointClass === "openrouter" ||
+    (model.provider.toLowerCase() === "openrouter" && capabilities.endpointClass === "default")
+  );
+}
+
 async function describeImagesWithMinimax(params: {
   apiKey: string;
   modelId: string;
@@ -252,7 +272,9 @@ export async function describeImagesWithModel(
     agentDir: params.agentDir,
   });
 
-  const context = buildImageContext(prompt, params.images);
+  const context = buildImageContext(prompt, params.images, {
+    promptInUserContent: shouldPlaceImagePromptInUserContent(model),
+  });
 
   const controller = new AbortController();
   const timeout = typeof params.timeoutMs === "number" &&
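
For reviewers, a minimal sketch of the two Context shapes buildImageContext
can now emit. The inlined types below are illustrative stand-ins for the real
Context type (which lives elsewhere in the codebase and may carry more
fields), and the base64 payload is a placeholder:

// Illustrative stand-in types; the real Context type may carry more fields.
type TextPart = { type: "text"; text: string };
type ImagePart = { type: "image"; data: string; mimeType: string };

interface SketchContext {
  systemPrompt?: string;
  messages: Array<{
    role: "user";
    content: Array<TextPart | ImagePart>;
    timestamp: number;
  }>;
}

// Default providers: the prompt rides in systemPrompt and the user message
// carries only image parts.
const defaultShape: SketchContext = {
  systemPrompt: "Describe the image.",
  messages: [
    {
      role: "user",
      content: [{ type: "image", data: "<base64>", mimeType: "image/png" }],
      timestamp: Date.now(),
    },
  ],
};

// OpenRouter: no systemPrompt; the prompt becomes the first user text part,
// placed ahead of the image parts, which is what restores non-empty vision
// responses from OpenRouter multimodal models.
const openRouterShape: SketchContext = {
  messages: [
    {
      role: "user",
      content: [
        { type: "text", text: "Describe the image." },
        { type: "image", data: "<base64>", mimeType: "image/png" },
      ],
      timestamp: Date.now(),
    },
  ],
};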