fix: restore OpenRouter vision prompts

This commit is contained in:
Peter Steinberger
2026-04-24 00:42:23 +01:00
parent d16b879334
commit 178a314a4c
3 changed files with 77 additions and 7 deletions

View File

@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/transport: stop embedded runs from lowering the process-wide undici stream timeouts, so slow Gemini image generation and other long-running provider requests no longer inherit short run-attempt header timeouts. Fixes #70423. Thanks @giangthb.
- Providers/OpenRouter: send image-understanding prompts as user text before image parts, restoring non-empty vision responses for OpenRouter multimodal models. Fixes #70410.
- Memory/QMD: recreate stale managed QMD collections when startup repair finds the collection name already exists, so root memory narrows back to `MEMORY.md` instead of staying on broad workspace markdown indexing.
- Agents/OpenAI: surface selected-model capacity failures from PI, Codex, and auto-reply harness paths with a model-switch hint instead of the generic empty-response error. Thanks @vincentkoc.
- Providers/OpenAI: route `openai/gpt-image-2` through configured Codex OAuth directly when an `openai-codex` profile is active, instead of probing `OPENAI_API_KEY` first.

View File

@@ -249,6 +249,53 @@ describe("describeImageWithModel", () => {
expect(context?.messages?.[0]?.content).toHaveLength(1);
});
// Regression test for #70410: OpenRouter multimodal models returned empty
// responses when the prompt was delivered only as a system prompt. The fix
// places the prompt as a user text part ahead of the image parts.
it("places OpenRouter image prompts in user content before images", async () => {
// Model discovery resolves to an OpenRouter-hosted multimodal model that
// accepts both text and image input.
discoverModelsMock.mockReturnValue({
find: vi.fn(() => ({
api: "openai-completions",
provider: "openrouter",
id: "google/gemini-2.5-flash",
input: ["text", "image"],
baseUrl: "https://openrouter.ai/api/v1",
})),
});
// Stub the completion call so we can inspect the context it was given.
completeMock.mockResolvedValue({
role: "assistant",
api: "openai-completions",
provider: "openrouter",
model: "google/gemini-2.5-flash",
stopReason: "stop",
timestamp: Date.now(),
content: [{ type: "text", text: "openrouter ok" }],
});
const result = await describeImageWithModel({
cfg: {},
agentDir: "/tmp/openclaw-agent",
provider: "openrouter",
model: "google/gemini-2.5-flash",
buffer: Buffer.from("png-bytes"),
fileName: "image.png",
mime: "image/png",
prompt: "Describe the image.",
timeoutMs: 1000,
});
// The assistant text and model id are surfaced unchanged.
expect(result).toEqual({
text: "openrouter ok",
model: "google/gemini-2.5-flash",
});
// Second positional argument of complete() is the request context.
const [, context] = completeMock.mock.calls[0] ?? [];
// For OpenRouter the prompt must NOT be a system prompt...
expect(context?.systemPrompt).toBeUndefined();
// ...and must appear as a user text part BEFORE the image part.
expect(context?.messages?.[0]?.content).toEqual([
{ type: "text", text: "Describe the image." },
expect.objectContaining({
type: "image",
mimeType: "image/png",
}),
]);
});
it.each([
{
name: "direct OpenAI Responses baseUrl",

View File

@@ -132,23 +132,43 @@ async function resolveImageRuntime(params: {
/**
 * Build the LLM request context for an image-description call.
 *
 * @param prompt  Instruction describing what the model should do with the images.
 * @param images  Raw image bytes plus an optional MIME type (defaults to image/jpeg).
 * @param opts.promptInUserContent  When true, the prompt is sent as a user text
 *   part placed BEFORE the image parts instead of as a system prompt. Needed for
 *   OpenRouter, which returns empty vision responses for system-prompt-only
 *   requests (see shouldPlaceImagePromptInUserContent). Defaults to the legacy
 *   system-prompt behavior, so existing callers are unaffected.
 */
function buildImageContext(
  prompt: string,
  images: Array<{ buffer: Buffer; mime?: string }>,
  opts?: { promptInUserContent?: boolean },
): Context {
  const imageContent = images.map((image) => ({
    type: "image" as const,
    data: image.buffer.toString("base64"),
    mimeType: image.mime ?? "image/jpeg",
  }));
  // Prompt-first ordering matters: some providers ignore text that follows images.
  const content = opts?.promptInUserContent
    ? [{ type: "text" as const, text: prompt }, ...imageContent]
    : imageContent;
  return {
    // Only attach a system prompt in the legacy mode; otherwise the prompt
    // already travels inside the user message content.
    ...(opts?.promptInUserContent ? {} : { systemPrompt: prompt }),
    messages: [
      {
        role: "user",
        content,
        timestamp: Date.now(),
      },
    ],
  };
}
/**
 * Decide whether the image prompt should travel inside the user message
 * content instead of as a system prompt.
 *
 * NOTE(review): OpenRouter appears to return empty vision responses when the
 * prompt is only a system prompt (see #70410 in the changelog) — hence the
 * special-casing below. Returns true when the resolved endpoint class is
 * "openrouter", or when the provider itself is OpenRouter and the endpoint
 * class fell through to "default".
 */
function shouldPlaceImagePromptInUserContent(model: Model<Api>): boolean {
  const caps = resolveProviderRequestCapabilities({
    provider: model.provider,
    api: model.api,
    baseUrl: model.baseUrl,
    capability: "image",
    transport: "media-understanding",
  });
  if (caps.endpointClass === "openrouter") {
    return true;
  }
  const isOpenRouterProvider = model.provider.toLowerCase() === "openrouter";
  return isOpenRouterProvider && caps.endpointClass === "default";
}
async function describeImagesWithMinimax(params: {
apiKey: string;
modelId: string;
@@ -252,7 +272,9 @@ export async function describeImagesWithModel(
agentDir: params.agentDir,
});
const context = buildImageContext(prompt, params.images);
const context = buildImageContext(prompt, params.images, {
promptInUserContent: shouldPlaceImagePromptInUserContent(model),
});
const controller = new AbortController();
const timeout =
typeof params.timeoutMs === "number" &&