mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-12 09:41:11 +00:00
feat(ollama): detect vision capability from /api/show and set image input (#62193)
Merged via squash.
Prepared head SHA: 85f85d1036
Co-authored-by: BruceMacD <5853428+BruceMacD@users.noreply.github.com>
Reviewed-by: @BruceMacD
This commit is contained in:
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Tools/media generation: auto-fallback across auth-backed image, music, and video providers by default, and remap fallback size, aspect ratio, resolution, and duration hints to the closest supported option instead of dropping intent on provider switches.
|
||||
- Tools/media generation: report applied fallback geometry and duration settings consistently in tool results, add a shared normalization contract for image/music/video runtimes, and simplify the bundled image-generation-core runtime test to only verify the plugin-sdk re-export seam.
|
||||
- Gateway/sessions: add persisted compaction checkpoints plus Sessions UI branch/restore actions so operators can inspect and recover pre-compaction session state. (#62146) Thanks @scoootscooob.
|
||||
- Providers/Ollama: detect vision capability from the `/api/show` response and set image input on models that support it so Ollama vision models accept image attachments. (#62193) Thanks @BruceMacD.
|
||||
|
||||
### Fixes
|
||||
|
||||
|
||||
@@ -11,7 +11,9 @@ export {
|
||||
fetchOllamaModels,
|
||||
isReasoningModelHeuristic,
|
||||
queryOllamaContextWindow,
|
||||
queryOllamaModelShowInfo,
|
||||
resolveOllamaApiBase,
|
||||
type OllamaModelShowInfo,
|
||||
type OllamaModelWithContext,
|
||||
type OllamaTagModel,
|
||||
type OllamaTagsResponse,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { jsonResponse, requestBodyText, requestUrl } from "../../../src/test-helpers/http.js";
|
||||
import {
|
||||
buildOllamaModelDefinition,
|
||||
enrichOllamaModelsWithContext,
|
||||
resolveOllamaApiBase,
|
||||
type OllamaTagModel,
|
||||
@@ -16,7 +17,7 @@ describe("ollama provider models", () => {
|
||||
expect(resolveOllamaApiBase("http://127.0.0.1:11434///")).toBe("http://127.0.0.1:11434");
|
||||
});
|
||||
|
||||
it("enriches discovered models with context windows from /api/show", async () => {
|
||||
it("sets discovered models with context windows from /api/show", async () => {
|
||||
const models: OllamaTagModel[] = [{ name: "llama3:8b" }, { name: "deepseek-r1:14b" }];
|
||||
const fetchMock = vi.fn(async (input: string | URL | Request, init?: RequestInit) => {
|
||||
const url = requestUrl(input);
|
||||
@@ -34,8 +35,63 @@ describe("ollama provider models", () => {
|
||||
const enriched = await enrichOllamaModelsWithContext("http://127.0.0.1:11434", models);
|
||||
|
||||
expect(enriched).toEqual([
|
||||
{ name: "llama3:8b", contextWindow: 65536 },
|
||||
{ name: "deepseek-r1:14b", contextWindow: undefined },
|
||||
{ name: "llama3:8b", contextWindow: 65536, capabilities: undefined },
|
||||
{ name: "deepseek-r1:14b", contextWindow: undefined, capabilities: undefined },
|
||||
]);
|
||||
});
|
||||
|
||||
it("sets models with vision capability from /api/show capabilities", async () => {
|
||||
const models: OllamaTagModel[] = [{ name: "kimi-k2.5:cloud" }, { name: "glm-5:cloud" }];
|
||||
const fetchMock = vi.fn(async (input: string | URL | Request, init?: RequestInit) => {
|
||||
const url = requestUrl(input);
|
||||
if (!url.endsWith("/api/show")) {
|
||||
throw new Error(`Unexpected fetch: ${url}`);
|
||||
}
|
||||
const body = JSON.parse(requestBodyText(init?.body)) as { name?: string };
|
||||
if (body.name === "kimi-k2.5:cloud") {
|
||||
return jsonResponse({
|
||||
model_info: { "kimi-k2.context_length": 262144 },
|
||||
capabilities: ["vision", "thinking", "completion", "tools"],
|
||||
});
|
||||
}
|
||||
if (body.name === "glm-5:cloud") {
|
||||
return jsonResponse({
|
||||
model_info: { "glm5.context_length": 202752 },
|
||||
capabilities: ["thinking", "completion", "tools"],
|
||||
});
|
||||
}
|
||||
return jsonResponse({});
|
||||
});
|
||||
vi.stubGlobal("fetch", fetchMock);
|
||||
|
||||
const enriched = await enrichOllamaModelsWithContext("http://127.0.0.1:11434", models);
|
||||
|
||||
expect(enriched).toEqual([
|
||||
{
|
||||
name: "kimi-k2.5:cloud",
|
||||
contextWindow: 262144,
|
||||
capabilities: ["vision", "thinking", "completion", "tools"],
|
||||
},
|
||||
{
|
||||
name: "glm-5:cloud",
|
||||
contextWindow: 202752,
|
||||
capabilities: ["thinking", "completion", "tools"],
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it("buildOllamaModelDefinition sets input to text+image when vision capability is present", () => {
|
||||
const visionModel = buildOllamaModelDefinition("kimi-k2.5:cloud", 262144, [
|
||||
"vision",
|
||||
"completion",
|
||||
"tools",
|
||||
]);
|
||||
expect(visionModel.input).toEqual(["text", "image"]);
|
||||
|
||||
const textModel = buildOllamaModelDefinition("glm-5:cloud", 202752, ["completion", "tools"]);
|
||||
expect(textModel.input).toEqual(["text"]);
|
||||
|
||||
const noCapabilities = buildOllamaModelDefinition("unknown-model", 65536);
|
||||
expect(noCapabilities.input).toEqual(["text"]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -25,6 +25,7 @@ export type OllamaTagsResponse = {
|
||||
|
||||
export type OllamaModelWithContext = OllamaTagModel & {
|
||||
contextWindow?: number;
|
||||
capabilities?: string[];
|
||||
};
|
||||
|
||||
const OLLAMA_SHOW_CONCURRENCY = 8;
|
||||
@@ -56,10 +57,15 @@ export function resolveOllamaApiBase(configuredBaseUrl?: string): string {
|
||||
return trimmed.replace(/\/v1$/i, "");
|
||||
}
|
||||
|
||||
export async function queryOllamaContextWindow(
|
||||
export type OllamaModelShowInfo = {
|
||||
contextWindow?: number;
|
||||
capabilities?: string[];
|
||||
};
|
||||
|
||||
export async function queryOllamaModelShowInfo(
|
||||
apiBase: string,
|
||||
modelName: string,
|
||||
): Promise<number | undefined> {
|
||||
): Promise<OllamaModelShowInfo> {
|
||||
try {
|
||||
const { response, release } = await fetchWithSsrFGuard({
|
||||
url: `${apiBase}/api/show`,
|
||||
@@ -74,33 +80,51 @@ export async function queryOllamaContextWindow(
|
||||
});
|
||||
try {
|
||||
if (!response.ok) {
|
||||
return undefined;
|
||||
return {};
|
||||
}
|
||||
const data = (await response.json()) as { model_info?: Record<string, unknown> };
|
||||
if (!data.model_info) {
|
||||
return undefined;
|
||||
}
|
||||
for (const [key, value] of Object.entries(data.model_info)) {
|
||||
if (
|
||||
key.endsWith(".context_length") &&
|
||||
typeof value === "number" &&
|
||||
Number.isFinite(value)
|
||||
) {
|
||||
const contextWindow = Math.floor(value);
|
||||
if (contextWindow > 0) {
|
||||
return contextWindow;
|
||||
const data = (await response.json()) as {
|
||||
model_info?: Record<string, unknown>;
|
||||
capabilities?: unknown;
|
||||
};
|
||||
|
||||
let contextWindow: number | undefined;
|
||||
if (data.model_info) {
|
||||
for (const [key, value] of Object.entries(data.model_info)) {
|
||||
if (
|
||||
key.endsWith(".context_length") &&
|
||||
typeof value === "number" &&
|
||||
Number.isFinite(value)
|
||||
) {
|
||||
const ctx = Math.floor(value);
|
||||
if (ctx > 0) {
|
||||
contextWindow = ctx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
|
||||
const capabilities = Array.isArray(data.capabilities)
|
||||
? (data.capabilities as unknown[]).filter((c): c is string => typeof c === "string")
|
||||
: undefined;
|
||||
|
||||
return { contextWindow, capabilities };
|
||||
} finally {
|
||||
await release();
|
||||
}
|
||||
} catch {
|
||||
return undefined;
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
/** @deprecated Use queryOllamaModelShowInfo instead. */
|
||||
export async function queryOllamaContextWindow(
|
||||
apiBase: string,
|
||||
modelName: string,
|
||||
): Promise<number | undefined> {
|
||||
return (await queryOllamaModelShowInfo(apiBase, modelName)).contextWindow;
|
||||
}
|
||||
|
||||
export async function enrichOllamaModelsWithContext(
|
||||
apiBase: string,
|
||||
models: OllamaTagModel[],
|
||||
@@ -111,10 +135,14 @@ export async function enrichOllamaModelsWithContext(
|
||||
for (let index = 0; index < models.length; index += concurrency) {
|
||||
const batch = models.slice(index, index + concurrency);
|
||||
const batchResults = await Promise.all(
|
||||
batch.map(async (model) => ({
|
||||
...model,
|
||||
contextWindow: await queryOllamaContextWindow(apiBase, model.name),
|
||||
})),
|
||||
batch.map(async (model) => {
|
||||
const showInfo = await queryOllamaModelShowInfo(apiBase, model.name);
|
||||
return {
|
||||
...model,
|
||||
contextWindow: showInfo.contextWindow,
|
||||
capabilities: showInfo.capabilities,
|
||||
};
|
||||
}),
|
||||
);
|
||||
enriched.push(...batchResults);
|
||||
}
|
||||
@@ -128,12 +156,15 @@ export function isReasoningModelHeuristic(modelId: string): boolean {
|
||||
export function buildOllamaModelDefinition(
|
||||
modelId: string,
|
||||
contextWindow?: number,
|
||||
capabilities?: string[],
|
||||
): ModelDefinitionConfig {
|
||||
const hasVision = capabilities?.includes("vision") ?? false;
|
||||
const input: ("text" | "image")[] = hasVision ? ["text", "image"] : ["text"];
|
||||
return {
|
||||
id: modelId,
|
||||
name: modelId,
|
||||
reasoning: isReasoningModelHeuristic(modelId),
|
||||
input: ["text"],
|
||||
input,
|
||||
cost: OLLAMA_DEFAULT_COST,
|
||||
contextWindow: contextWindow ?? OLLAMA_DEFAULT_CONTEXT_WINDOW,
|
||||
maxTokens: OLLAMA_DEFAULT_MAX_TOKENS,
|
||||
|
||||
@@ -204,7 +204,8 @@ describe("ollama setup", () => {
|
||||
isRemote: false,
|
||||
openUrl: vi.fn(async () => undefined),
|
||||
});
|
||||
const modelIds = result.config.models?.providers?.ollama?.models?.map((m) => m.id);
|
||||
const models = result.config.models?.providers?.ollama?.models;
|
||||
const modelIds = models?.map((m) => m.id);
|
||||
|
||||
expect(modelIds).toEqual([
|
||||
"kimi-k2.5:cloud",
|
||||
@@ -214,6 +215,10 @@ describe("ollama setup", () => {
|
||||
"glm-4.7-flash",
|
||||
"deepseek-r1:14b",
|
||||
]);
|
||||
expect(models?.find((model) => model.id === "kimi-k2.5:cloud")?.input).toEqual([
|
||||
"text",
|
||||
"image",
|
||||
]);
|
||||
});
|
||||
|
||||
it("uses /api/show context windows when building Ollama model configs", async () => {
|
||||
|
||||
@@ -245,9 +245,14 @@ function buildOllamaModelsConfig(
|
||||
modelNames: string[],
|
||||
discoveredModelsByName?: Map<string, OllamaModelWithContext>,
|
||||
) {
|
||||
return modelNames.map((name) =>
|
||||
buildOllamaModelDefinition(name, discoveredModelsByName?.get(name)?.contextWindow),
|
||||
);
|
||||
return modelNames.map((name) => {
|
||||
const discovered = discoveredModelsByName?.get(name);
|
||||
// Suggested cloud models may be injected before `/api/tags` exposes them,
|
||||
// so keep Kimi vision-capable during setup even without discovered metadata.
|
||||
const capabilities =
|
||||
discovered?.capabilities ?? (name === "kimi-k2.5:cloud" ? ["vision"] : undefined);
|
||||
return buildOllamaModelDefinition(name, discovered?.contextWindow, capabilities);
|
||||
});
|
||||
}
|
||||
|
||||
function applyOllamaProviderConfig(
|
||||
@@ -299,7 +304,9 @@ export async function buildOllamaProvider(
|
||||
return {
|
||||
baseUrl: apiBase,
|
||||
api: "ollama",
|
||||
models: discovered.map((model) => buildOllamaModelDefinition(model.name, model.contextWindow)),
|
||||
models: discovered.map((model) =>
|
||||
buildOllamaModelDefinition(model.name, model.contextWindow, model.capabilities),
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user