feat(ollama): detect vision capability from /api/show and set image input

This commit is contained in:
Bruce MacDonald
2026-04-06 17:13:21 -07:00
parent f4fcaa09a3
commit 85f85d1036
6 changed files with 133 additions and 31 deletions

View File

@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
- Tools/media generation: auto-fallback across auth-backed image, music, and video providers by default, and remap fallback size, aspect ratio, resolution, and duration hints to the closest supported option instead of dropping intent on provider switches.
- Tools/media generation: report applied fallback geometry and duration settings consistently in tool results, add a shared normalization contract for image/music/video runtimes, and simplify the bundled image-generation-core runtime test to only verify the plugin-sdk re-export seam.
- Gateway/sessions: add persisted compaction checkpoints plus Sessions UI branch/restore actions so operators can inspect and recover pre-compaction session state. (#62146) Thanks @scoootscooob.
- Providers/Ollama: detect vision capability from the `/api/show` response and set image input on models that support it so Ollama vision models accept image attachments. (#62193) Thanks @BruceMacD.
### Fixes

View File

@@ -11,7 +11,9 @@ export {
fetchOllamaModels,
isReasoningModelHeuristic,
queryOllamaContextWindow,
queryOllamaModelShowInfo,
resolveOllamaApiBase,
type OllamaModelShowInfo,
type OllamaModelWithContext,
type OllamaTagModel,
type OllamaTagsResponse,

View File

@@ -1,6 +1,7 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { jsonResponse, requestBodyText, requestUrl } from "../../../src/test-helpers/http.js";
import {
buildOllamaModelDefinition,
enrichOllamaModelsWithContext,
resolveOllamaApiBase,
type OllamaTagModel,
@@ -16,7 +17,7 @@ describe("ollama provider models", () => {
expect(resolveOllamaApiBase("http://127.0.0.1:11434///")).toBe("http://127.0.0.1:11434");
});
it("enriches discovered models with context windows from /api/show", async () => {
it("sets discovered models with context windows from /api/show", async () => {
const models: OllamaTagModel[] = [{ name: "llama3:8b" }, { name: "deepseek-r1:14b" }];
const fetchMock = vi.fn(async (input: string | URL | Request, init?: RequestInit) => {
const url = requestUrl(input);
@@ -34,8 +35,63 @@ describe("ollama provider models", () => {
const enriched = await enrichOllamaModelsWithContext("http://127.0.0.1:11434", models);
expect(enriched).toEqual([
{ name: "llama3:8b", contextWindow: 65536 },
{ name: "deepseek-r1:14b", contextWindow: undefined },
{ name: "llama3:8b", contextWindow: 65536, capabilities: undefined },
{ name: "deepseek-r1:14b", contextWindow: undefined, capabilities: undefined },
]);
});
// The enrichment pass should surface the /api/show `capabilities` array
// unchanged on each discovered model, alongside the parsed context window.
it("sets models with vision capability from /api/show capabilities", async () => {
  const models: OllamaTagModel[] = [{ name: "kimi-k2.5:cloud" }, { name: "glm-5:cloud" }];
  // Canned /api/show payloads, keyed by the model name sent in the request body.
  const showResponses: Record<string, object> = {
    "kimi-k2.5:cloud": {
      model_info: { "kimi-k2.context_length": 262144 },
      capabilities: ["vision", "thinking", "completion", "tools"],
    },
    "glm-5:cloud": {
      model_info: { "glm5.context_length": 202752 },
      capabilities: ["thinking", "completion", "tools"],
    },
  };
  const fetchMock = vi.fn(async (input: string | URL | Request, init?: RequestInit) => {
    const url = requestUrl(input);
    // Only /api/show is expected during enrichment; anything else is a test bug.
    if (!url.endsWith("/api/show")) {
      throw new Error(`Unexpected fetch: ${url}`);
    }
    const body = JSON.parse(requestBodyText(init?.body)) as { name?: string };
    const payload = body.name === undefined ? undefined : showResponses[body.name];
    // Unknown models get an empty payload, mirroring a metadata-less response.
    return jsonResponse(payload ?? {});
  });
  vi.stubGlobal("fetch", fetchMock);
  const enriched = await enrichOllamaModelsWithContext("http://127.0.0.1:11434", models);
  expect(enriched).toEqual([
    {
      name: "kimi-k2.5:cloud",
      contextWindow: 262144,
      capabilities: ["vision", "thinking", "completion", "tools"],
    },
    {
      name: "glm-5:cloud",
      contextWindow: 202752,
      capabilities: ["thinking", "completion", "tools"],
    },
  ]);
});
// Only an explicit "vision" entry in capabilities should widen a model's
// input modalities to text+image; absent or vision-less capabilities stay text-only.
it("buildOllamaModelDefinition sets input to text+image when vision capability is present", () => {
  const withVision = buildOllamaModelDefinition("kimi-k2.5:cloud", 262144, [
    "vision",
    "completion",
    "tools",
  ]);
  const withoutVision = buildOllamaModelDefinition("glm-5:cloud", 202752, ["completion", "tools"]);
  const withoutCapabilities = buildOllamaModelDefinition("unknown-model", 65536);
  expect(withVision.input).toEqual(["text", "image"]);
  expect(withoutVision.input).toEqual(["text"]);
  expect(withoutCapabilities.input).toEqual(["text"]);
});
});

View File

@@ -25,6 +25,7 @@ export type OllamaTagsResponse = {
// A model discovered via /api/tags, augmented with the per-model metadata
// (context window, capability strings) gathered from /api/show.
export type OllamaModelWithContext = OllamaTagModel & {
contextWindow?: number;
capabilities?: string[];
};
// Batch size for parallel /api/show lookups during model enrichment.
const OLLAMA_SHOW_CONCURRENCY = 8;
@@ -56,10 +57,15 @@ export function resolveOllamaApiBase(configuredBaseUrl?: string): string {
return trimmed.replace(/\/v1$/i, "");
}
export async function queryOllamaContextWindow(
/** Subset of the Ollama /api/show response that this provider consumes. */
export type OllamaModelShowInfo = {
// Context window parsed from a positive `*.context_length` entry in model_info, if present.
contextWindow?: number;
// Capability strings reported by the server (e.g. "vision", "tools"); non-strings are filtered out.
capabilities?: string[];
};
export async function queryOllamaModelShowInfo(
apiBase: string,
modelName: string,
): Promise<number | undefined> {
): Promise<OllamaModelShowInfo> {
try {
const { response, release } = await fetchWithSsrFGuard({
url: `${apiBase}/api/show`,
@@ -74,33 +80,51 @@ export async function queryOllamaContextWindow(
});
try {
if (!response.ok) {
return undefined;
return {};
}
const data = (await response.json()) as { model_info?: Record<string, unknown> };
if (!data.model_info) {
return undefined;
}
for (const [key, value] of Object.entries(data.model_info)) {
if (
key.endsWith(".context_length") &&
typeof value === "number" &&
Number.isFinite(value)
) {
const contextWindow = Math.floor(value);
if (contextWindow > 0) {
return contextWindow;
const data = (await response.json()) as {
model_info?: Record<string, unknown>;
capabilities?: unknown;
};
let contextWindow: number | undefined;
if (data.model_info) {
for (const [key, value] of Object.entries(data.model_info)) {
if (
key.endsWith(".context_length") &&
typeof value === "number" &&
Number.isFinite(value)
) {
const ctx = Math.floor(value);
if (ctx > 0) {
contextWindow = ctx;
break;
}
}
}
}
return undefined;
const capabilities = Array.isArray(data.capabilities)
? (data.capabilities as unknown[]).filter((c): c is string => typeof c === "string")
: undefined;
return { contextWindow, capabilities };
} finally {
await release();
}
} catch {
return undefined;
return {};
}
}
/**
 * Legacy accessor for a model's context window only.
 *
 * Thin wrapper kept for backward compatibility; delegates to the richer
 * show-info query and discards everything except the context window.
 *
 * @deprecated Use queryOllamaModelShowInfo instead.
 */
export async function queryOllamaContextWindow(
  apiBase: string,
  modelName: string,
): Promise<number | undefined> {
  const showInfo = await queryOllamaModelShowInfo(apiBase, modelName);
  return showInfo.contextWindow;
}
export async function enrichOllamaModelsWithContext(
apiBase: string,
models: OllamaTagModel[],
@@ -111,10 +135,14 @@ export async function enrichOllamaModelsWithContext(
for (let index = 0; index < models.length; index += concurrency) {
const batch = models.slice(index, index + concurrency);
const batchResults = await Promise.all(
batch.map(async (model) => ({
...model,
contextWindow: await queryOllamaContextWindow(apiBase, model.name),
})),
batch.map(async (model) => {
const showInfo = await queryOllamaModelShowInfo(apiBase, model.name);
return {
...model,
contextWindow: showInfo.contextWindow,
capabilities: showInfo.capabilities,
};
}),
);
enriched.push(...batchResults);
}
@@ -128,12 +156,15 @@ export function isReasoningModelHeuristic(modelId: string): boolean {
export function buildOllamaModelDefinition(
modelId: string,
contextWindow?: number,
capabilities?: string[],
): ModelDefinitionConfig {
const hasVision = capabilities?.includes("vision") ?? false;
const input: ("text" | "image")[] = hasVision ? ["text", "image"] : ["text"];
return {
id: modelId,
name: modelId,
reasoning: isReasoningModelHeuristic(modelId),
input: ["text"],
input,
cost: OLLAMA_DEFAULT_COST,
contextWindow: contextWindow ?? OLLAMA_DEFAULT_CONTEXT_WINDOW,
maxTokens: OLLAMA_DEFAULT_MAX_TOKENS,

View File

@@ -204,7 +204,8 @@ describe("ollama setup", () => {
isRemote: false,
openUrl: vi.fn(async () => undefined),
});
const modelIds = result.config.models?.providers?.ollama?.models?.map((m) => m.id);
const models = result.config.models?.providers?.ollama?.models;
const modelIds = models?.map((m) => m.id);
expect(modelIds).toEqual([
"kimi-k2.5:cloud",
@@ -214,6 +215,10 @@ describe("ollama setup", () => {
"glm-4.7-flash",
"deepseek-r1:14b",
]);
expect(models?.find((model) => model.id === "kimi-k2.5:cloud")?.input).toEqual([
"text",
"image",
]);
});
it("uses /api/show context windows when building Ollama model configs", async () => {

View File

@@ -245,9 +245,14 @@ function buildOllamaModelsConfig(
modelNames: string[],
discoveredModelsByName?: Map<string, OllamaModelWithContext>,
) {
return modelNames.map((name) =>
buildOllamaModelDefinition(name, discoveredModelsByName?.get(name)?.contextWindow),
);
return modelNames.map((name) => {
const discovered = discoveredModelsByName?.get(name);
// Suggested cloud models may be injected before `/api/tags` exposes them,
// so keep Kimi vision-capable during setup even without discovered metadata.
const capabilities =
discovered?.capabilities ?? (name === "kimi-k2.5:cloud" ? ["vision"] : undefined);
return buildOllamaModelDefinition(name, discovered?.contextWindow, capabilities);
});
}
function applyOllamaProviderConfig(
@@ -299,7 +304,9 @@ export async function buildOllamaProvider(
return {
baseUrl: apiBase,
api: "ollama",
models: discovered.map((model) => buildOllamaModelDefinition(model.name, model.contextWindow)),
models: discovered.map((model) =>
buildOllamaModelDefinition(model.name, model.contextWindow, model.capabilities),
),
};
}