From 9a22cd212b1e4a01d571680a329657f38f0dd54c Mon Sep 17 00:00:00 2001
From: soloclz <soloclz.mail@gmail.com>
Date: Wed, 22 Apr 2026 02:47:39 +0800
Subject: [PATCH] fix(ollama): register media-understanding provider so image
 tool can route ollama/* models

Ollama chat models already support image inputs (extensions/ollama/src/stream.ts
extracts image parts and forwards them via the Ollama API), but the ollama
plugin did not register a MediaUnderstandingProvider. The image tool's provider
registry therefore had no 'ollama' entry, so requests like
`imageModel: 'ollama/qwen2.5vl:7b'` failed to resolve and fell back to
unrelated providers.

Register ollamaMediaUnderstandingProvider with:
- capabilities: ['image']
- describeImage/describeImages wired to the shared describeImageWithModel /
  describeImagesWithModel helpers from openclaw/plugin-sdk/media-understanding
  (reuses the same pi-ai complete path Ollama chat already goes through)
- no defaultModels or autoPriority: Ollama vision support depends on which
  model the user has pulled, so we don't pick a canonical default and don't
  auto-steal image duty from configured providers.

Fixes #69071 (and supersedes #60280).
---
 extensions/ollama/index.test.ts               | 38 +++++++++++++++++++
 extensions/ollama/index.ts                    |  2 +
 .../src/media-understanding-provider.ts       | 18 +++++++++
 3 files changed, 58 insertions(+)
 create mode 100644 extensions/ollama/src/media-understanding-provider.ts

diff --git a/extensions/ollama/index.test.ts b/extensions/ollama/index.test.ts
index 8e74a32df6f..43a86220371 100644
--- a/extensions/ollama/index.test.ts
+++ b/extensions/ollama/index.test.ts
@@ -495,4 +495,42 @@ describe("ollama plugin", () => {
     expect(baseStreamFn).toHaveBeenCalledTimes(1);
     expect(payloadSeen?.think).toBeUndefined();
   });
+
+  it("registers an image-capable media understanding provider so image tool can route ollama/*", () => {
+    const mediaProviders: Array<{
+      id: string;
+      capabilities?: string[];
+      defaultModels?: Record<string, string>;
+      autoPriority?: Record<string, number>;
+      describeImage?: unknown;
+      describeImages?: unknown;
+    }> = [];
+
+    plugin.register(
+      createTestPluginApi({
+        id: "ollama",
+        name: "Ollama",
+        source: "test",
+        config: {},
+        pluginConfig: {},
+        runtime: {} as never,
+        registerProvider() {},
+        registerMediaUnderstandingProvider(provider) {
+          mediaProviders.push(provider);
+        },
+      }),
+    );
+
+    expect(mediaProviders).toHaveLength(1);
+    const [ollamaMedia] = mediaProviders;
+    expect(ollamaMedia.id).toBe("ollama");
+    expect(ollamaMedia.capabilities).toEqual(["image"]);
+    expect(typeof ollamaMedia.describeImage).toBe("function");
+    expect(typeof ollamaMedia.describeImages).toBe("function");
+    // Intentional: no defaultModels or autoPriority. Ollama vision models are
+    // user-installed (llava, qwen2.5vl, …) with no universal default, and we
+    // don't want Ollama to auto-steal image duty from configured providers.
+    expect(ollamaMedia.defaultModels).toBeUndefined();
+    expect(ollamaMedia.autoPriority).toBeUndefined();
+  });
 });
diff --git a/extensions/ollama/index.ts b/extensions/ollama/index.ts
index 4542bd80ec0..29d77050f40 100644
--- a/extensions/ollama/index.ts
+++ b/extensions/ollama/index.ts
@@ -25,6 +25,7 @@ import {
   DEFAULT_OLLAMA_EMBEDDING_MODEL,
   createOllamaEmbeddingProvider,
 } from "./src/embedding-provider.js";
+import { ollamaMediaUnderstandingProvider } from "./src/media-understanding-provider.js";
 import { ollamaMemoryEmbeddingProviderAdapter } from "./src/memory-embedding-adapter.js";
 import {
   createConfiguredOllamaCompatStreamWrapper,
@@ -55,6 +56,7 @@ export default definePluginEntry({
   description: "Bundled Ollama provider plugin",
   register(api: OpenClawPluginApi) {
     api.registerMemoryEmbeddingProvider(ollamaMemoryEmbeddingProviderAdapter);
+    api.registerMediaUnderstandingProvider(ollamaMediaUnderstandingProvider);
     const pluginConfig = (api.pluginConfig ?? {}) as OllamaPluginConfig;
     api.registerWebSearchProvider(createOllamaWebSearchProvider());
     api.registerProvider({
diff --git a/extensions/ollama/src/media-understanding-provider.ts b/extensions/ollama/src/media-understanding-provider.ts
new file mode 100644
index 00000000000..307e70862d3
--- /dev/null
+++ b/extensions/ollama/src/media-understanding-provider.ts
@@ -0,0 +1,18 @@
+import {
+  describeImageWithModel,
+  describeImagesWithModel,
+  type MediaUnderstandingProvider,
+} from "openclaw/plugin-sdk/media-understanding";
+import { OLLAMA_PROVIDER_ID } from "./discovery-shared.js";
+
+// Ollama vision support depends on which models the user has pulled (llava,
+// qwen2.5vl, llama3.2-vision, …) — there is no single canonical default. We
+// register the provider so the image tool can route `ollama/<vision-model>`
+// requests, but leave `defaultModels` and `autoPriority` unset so Ollama
+// only participates when the user explicitly configures an image model.
+export const ollamaMediaUnderstandingProvider: MediaUnderstandingProvider = {
+  id: OLLAMA_PROVIDER_ID,
+  capabilities: ["image"],
+  describeImage: describeImageWithModel,
+  describeImages: describeImagesWithModel,
+};