diff --git a/CHANGELOG.md b/CHANGELOG.md index 513c6aa7196..5ae7529c86b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai - Tools/media generation: auto-fallback across auth-backed image, music, and video providers by default, and remap fallback size, aspect ratio, resolution, and duration hints to the closest supported option instead of dropping intent on provider switches. - Tools/media generation: report applied fallback geometry and duration settings consistently in tool results, add a shared normalization contract for image/music/video runtimes, and simplify the bundled image-generation-core runtime test to only verify the plugin-sdk re-export seam. - Gateway/sessions: add persisted compaction checkpoints plus Sessions UI branch/restore actions so operators can inspect and recover pre-compaction session state. (#62146) Thanks @scoootscooob. +- Providers/Ollama: detect vision capability from the `/api/show` response and set image input on models that support it so Ollama vision models accept image attachments. (#62193) Thanks @BruceMacD. ### Fixes diff --git a/extensions/ollama/api.ts b/extensions/ollama/api.ts index 1620f6e5049..0d18cf8d562 100644 --- a/extensions/ollama/api.ts +++ b/extensions/ollama/api.ts @@ -11,7 +11,9 @@ export { fetchOllamaModels, isReasoningModelHeuristic, queryOllamaContextWindow, + queryOllamaModelShowInfo, resolveOllamaApiBase, + type OllamaModelShowInfo, type OllamaModelWithContext, type OllamaTagModel, type OllamaTagsResponse, diff --git a/extensions/ollama/src/provider-models.test.ts b/extensions/ollama/src/provider-models.test.ts index 11dd985b460..98b676c127f 100644 --- a/extensions/ollama/src/provider-models.test.ts +++ b/extensions/ollama/src/provider-models.test.ts @@ -1,6 +1,7 @@ import { afterEach, describe, expect, it, vi } from "vitest"; import { jsonResponse, requestBodyText, requestUrl } from "../../../src/test-helpers/http.js"; import { + buildOllamaModelDefinition, enrichOllamaModelsWithContext, resolveOllamaApiBase, type OllamaTagModel, @@ -16,7 +17,7 @@ describe("ollama provider models", () => { expect(resolveOllamaApiBase("http://127.0.0.1:11434///")).toBe("http://127.0.0.1:11434"); }); - it("enriches discovered models with context windows from /api/show", async () => { + it("sets discovered models with context windows from /api/show", async () => { const models: OllamaTagModel[] = [{ name: "llama3:8b" }, { name: "deepseek-r1:14b" }]; const fetchMock = vi.fn(async (input: string | URL | Request, init?: RequestInit) => { const url = requestUrl(input); @@ -34,8 +35,63 @@ describe("ollama provider models", () => { const enriched = await enrichOllamaModelsWithContext("http://127.0.0.1:11434", models); expect(enriched).toEqual([ - { name: "llama3:8b", contextWindow: 65536 }, - { name: "deepseek-r1:14b", contextWindow: undefined }, + { name: "llama3:8b", contextWindow: 65536, capabilities: undefined }, + { name: "deepseek-r1:14b", contextWindow: undefined, capabilities: undefined }, ]); }); + + it("sets models with vision capability from /api/show capabilities", async () => { + const models: OllamaTagModel[] = [{ name: "kimi-k2.5:cloud" }, { name: "glm-5:cloud" }]; + const fetchMock = vi.fn(async (input: string | URL | Request, init?: RequestInit) => { + const url = requestUrl(input); + if (!url.endsWith("/api/show")) { + throw new Error(`Unexpected fetch: ${url}`); + } + const body = JSON.parse(requestBodyText(init?.body)) as { name?: string }; + if (body.name === "kimi-k2.5:cloud") { + return jsonResponse({ + model_info: { "kimi-k2.context_length": 262144 }, + capabilities: ["vision", "thinking", "completion", "tools"], + }); + } + if (body.name === "glm-5:cloud") { + return jsonResponse({ + model_info: { "glm5.context_length": 202752 }, + capabilities: ["thinking", "completion", "tools"], + }); + } + return jsonResponse({}); + }); + vi.stubGlobal("fetch", fetchMock); + + const enriched = await enrichOllamaModelsWithContext("http://127.0.0.1:11434", models); + + expect(enriched).toEqual([ + { + name: "kimi-k2.5:cloud", + contextWindow: 262144, + capabilities: ["vision", "thinking", "completion", "tools"], + }, + { + name: "glm-5:cloud", + contextWindow: 202752, + capabilities: ["thinking", "completion", "tools"], + }, + ]); + }); + + it("buildOllamaModelDefinition sets input to text+image when vision capability is present", () => { + const visionModel = buildOllamaModelDefinition("kimi-k2.5:cloud", 262144, [ + "vision", + "completion", + "tools", + ]); + expect(visionModel.input).toEqual(["text", "image"]); + + const textModel = buildOllamaModelDefinition("glm-5:cloud", 202752, ["completion", "tools"]); + expect(textModel.input).toEqual(["text"]); + + const noCapabilities = buildOllamaModelDefinition("unknown-model", 65536); + expect(noCapabilities.input).toEqual(["text"]); + }); }); diff --git a/extensions/ollama/src/provider-models.ts b/extensions/ollama/src/provider-models.ts index e5b94c38f01..3dce0ef9887 100644 --- a/extensions/ollama/src/provider-models.ts +++ b/extensions/ollama/src/provider-models.ts @@ -25,6 +25,7 @@ export type OllamaTagsResponse = { export type OllamaModelWithContext = OllamaTagModel & { contextWindow?: number; + capabilities?: string[]; }; const OLLAMA_SHOW_CONCURRENCY = 8; @@ -56,10 +57,15 @@ export function resolveOllamaApiBase(configuredBaseUrl?: string): string { return trimmed.replace(/\/v1$/i, ""); } -export async function queryOllamaContextWindow( +export type OllamaModelShowInfo = { + contextWindow?: number; + capabilities?: string[]; +}; + +export async function queryOllamaModelShowInfo( apiBase: string, modelName: string, -): Promise { +): Promise { try { const { response, release } = await fetchWithSsrFGuard({ url: `${apiBase}/api/show`, @@ -74,33 +80,51 @@ export async function queryOllamaContextWindow( }); try { if (!response.ok) { - return undefined; + return {}; } - const data = (await response.json()) as { model_info?: Record }; - if (!data.model_info) { - return undefined; - } - for (const [key, value] of Object.entries(data.model_info)) { - if ( - key.endsWith(".context_length") && - typeof value === "number" && - Number.isFinite(value) - ) { - const contextWindow = Math.floor(value); - if (contextWindow > 0) { - return contextWindow; + const data = (await response.json()) as { + model_info?: Record; + capabilities?: unknown; + }; + + let contextWindow: number | undefined; + if (data.model_info) { + for (const [key, value] of Object.entries(data.model_info)) { + if ( + key.endsWith(".context_length") && + typeof value === "number" && + Number.isFinite(value) + ) { + const ctx = Math.floor(value); + if (ctx > 0) { + contextWindow = ctx; + break; + } } } } - return undefined; + + const capabilities = Array.isArray(data.capabilities) + ? (data.capabilities as unknown[]).filter((c): c is string => typeof c === "string") + : undefined; + + return { contextWindow, capabilities }; } finally { await release(); } } catch { - return undefined; + return {}; } } +/** @deprecated Use queryOllamaModelShowInfo instead. */ +export async function queryOllamaContextWindow( + apiBase: string, + modelName: string, +): Promise { + return (await queryOllamaModelShowInfo(apiBase, modelName)).contextWindow; +} + export async function enrichOllamaModelsWithContext( apiBase: string, models: OllamaTagModel[], @@ -111,10 +135,14 @@ export async function enrichOllamaModelsWithContext( for (let index = 0; index < models.length; index += concurrency) { const batch = models.slice(index, index + concurrency); const batchResults = await Promise.all( - batch.map(async (model) => ({ - ...model, - contextWindow: await queryOllamaContextWindow(apiBase, model.name), - })), + batch.map(async (model) => { + const showInfo = await queryOllamaModelShowInfo(apiBase, model.name); + return { + ...model, + contextWindow: showInfo.contextWindow, + capabilities: showInfo.capabilities, + }; + }), ); enriched.push(...batchResults); } @@ -128,12 +156,15 @@ export function isReasoningModelHeuristic(modelId: string): boolean { export function buildOllamaModelDefinition( modelId: string, contextWindow?: number, + capabilities?: string[], ): ModelDefinitionConfig { + const hasVision = capabilities?.includes("vision") ?? false; + const input: ("text" | "image")[] = hasVision ? ["text", "image"] : ["text"]; return { id: modelId, name: modelId, reasoning: isReasoningModelHeuristic(modelId), - input: ["text"], + input, cost: OLLAMA_DEFAULT_COST, contextWindow: contextWindow ?? OLLAMA_DEFAULT_CONTEXT_WINDOW, maxTokens: OLLAMA_DEFAULT_MAX_TOKENS, diff --git a/extensions/ollama/src/setup.test.ts b/extensions/ollama/src/setup.test.ts index c16c46680b2..4b4df938243 100644 --- a/extensions/ollama/src/setup.test.ts +++ b/extensions/ollama/src/setup.test.ts @@ -204,7 +204,8 @@ describe("ollama setup", () => { isRemote: false, openUrl: vi.fn(async () => undefined), }); - const modelIds = result.config.models?.providers?.ollama?.models?.map((m) => m.id); + const models = result.config.models?.providers?.ollama?.models; + const modelIds = models?.map((m) => m.id); expect(modelIds).toEqual([ "kimi-k2.5:cloud", @@ -214,6 +215,10 @@ describe("ollama setup", () => { "glm-4.7-flash", "deepseek-r1:14b", ]); + expect(models?.find((model) => model.id === "kimi-k2.5:cloud")?.input).toEqual([ + "text", + "image", + ]); }); it("uses /api/show context windows when building Ollama model configs", async () => { diff --git a/extensions/ollama/src/setup.ts b/extensions/ollama/src/setup.ts index 6260d619deb..631813d3b34 100644 --- a/extensions/ollama/src/setup.ts +++ b/extensions/ollama/src/setup.ts @@ -245,9 +245,14 @@ function buildOllamaModelsConfig( modelNames: string[], discoveredModelsByName?: Map, ) { - return modelNames.map((name) => - buildOllamaModelDefinition(name, discoveredModelsByName?.get(name)?.contextWindow), - ); + return modelNames.map((name) => { + const discovered = discoveredModelsByName?.get(name); + // Suggested cloud models may be injected before `/api/tags` exposes them, + // so keep Kimi vision-capable during setup even without discovered metadata. + const capabilities = + discovered?.capabilities ?? (name === "kimi-k2.5:cloud" ? ["vision"] : undefined); + return buildOllamaModelDefinition(name, discovered?.contextWindow, capabilities); + }); } function applyOllamaProviderConfig( @@ -299,7 +304,9 @@ export async function buildOllamaProvider( return { baseUrl: apiBase, api: "ollama", - models: discovered.map((model) => buildOllamaModelDefinition(model.name, model.contextWindow)), + models: discovered.map((model) => + buildOllamaModelDefinition(model.name, model.contextWindow, model.capabilities), + ), }; }