fix: resolve providerless image model refs

This commit is contained in:
Peter Steinberger
2026-04-28 10:18:00 +01:00
parent 5741e40c14
commit fac116cfa4
10 changed files with 241 additions and 7 deletions

View File

@@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai
- Providers/DeepSeek: backfill DeepSeek V4 `reasoning_content` on plain assistant replay messages as well as tool-call turns, so thinking sessions with prior tool use no longer fail follow-up requests with missing reasoning content. Fixes #73417; refs #71372. Thanks @34262315716 and @Bartok9.
- Auto-reply: preserve voice-note media from silent turns while continuing to suppress text and non-voice media, so `NO_REPLY` TTS replies still deliver the requested audio bubble. (#73406) Thanks @zqchris.
- Channels/Mattermost: stop enqueueing regular inbound posts as system events, so Mattermost user messages reach the model only as user-role inbound-envelope content instead of also appearing as `System: Mattermost message...` directives. Fixes #71795. Thanks @juan-flores077.
- Agents/media: qualify bare `agents.defaults.imageModel` and `pdfModel` refs from unique configured image-capable providers, so Ollama vision models such as `moondream` and `qwen2.5vl:7b` do not fall through to the default provider. Fixes #38816; supersedes #73396. Thanks @alainasclaw and @vincentkoc.
- Agents/Anthropic: send implicit Anthropic beta headers only to direct public Anthropic endpoints, including OAuth, so custom Anthropic-compatible providers no longer mis-handle unsupported beta flags unless explicitly configured. Refs #73346. Thanks @byBrodowski.
- Skills: require explicit `skills.entries.coding-agent.enabled` before exposing the bundled coding-agent skill, so installs with Codex on PATH but no OpenAI auth do not silently offer Codex delegation. Fixes #73358. Thanks @LaFleurAdvertising and @Sanjays2402.
- Agents/subagents: preserve `sessions_yield` as a paused subagent state and ignore its wait text while freezing completion output, so parent sessions wait for the final post-compaction answer instead of receiving intermediate progress or `(no output)`. Fixes #73413. Thanks @Ask-sola.

View File

@@ -342,6 +342,7 @@ Time format in system prompt. Default: `auto` (OS preference).
- `imageModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
- Used by the `image` tool path as its vision-model config.
- Also used as fallback routing when the selected/default model cannot accept image input.
- Prefer explicit `provider/model` refs. Bare IDs are accepted for compatibility; if a bare ID uniquely matches a configured image-capable entry in `models.providers.*.models`, OpenClaw qualifies it to that provider. Ambiguous configured matches require an explicit provider prefix.
- `imageGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
- Used by the shared image-generation capability and any future tool/plugin surface that generates images.
- Typical values: `google/gemini-3.1-flash-image-preview` for native Gemini image generation, `fal/fal-ai/flux/dev` for fal, `openai/gpt-image-2` for OpenAI Images, or `openai/gpt-image-1.5` for transparent-background OpenAI PNG/WebP output.

View File

@@ -172,6 +172,7 @@ If `tools.media.<capability>.enabled` is **not** set to `false` and you haven't
</Step>
<Step title="agents.defaults.imageModel">
`agents.defaults.imageModel` primary/fallback refs (image only).
Prefer `provider/model` refs. Bare refs are qualified from configured image-capable provider model entries only when the match is unique.
</Step>
<Step title="Local CLIs (audio only)">
Local CLIs (if installed):

View File

@@ -283,6 +283,8 @@ To make Ollama the default image-understanding model for inbound media, configur
}
```
Prefer the full `ollama/<model>` ref. If the same model is listed under `models.providers.ollama.models` with `input: ["text", "image"]` and no other configured image provider exposes that bare model ID, OpenClaw also normalizes a bare `imageModel` ref such as `qwen2.5vl:7b` to `ollama/qwen2.5vl:7b`. If more than one configured image provider has the same bare ID, use the provider prefix explicitly.
Slow local vision models may need a longer image-understanding timeout than cloud models. They can also crash or stop when Ollama tries to allocate the full advertised vision context on constrained hardware. Set a capability timeout, and cap `num_ctx` on the model entry when you only need a normal image-description turn:
```json5

View File

@@ -543,8 +543,8 @@ describe("CLI attempt execution", () => {
await runAgentAttempt({
providerOverride: "anthropic",
originalProvider: "anthropic",
modelOverride: "claude-opus-4-7",
originalProvider: "anthropic",
cfg: {
agents: {
defaults: {

View File

@@ -2,7 +2,7 @@ import type { AssistantMessage } from "@mariozechner/pi-ai";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { estimateBase64DecodedBytes } from "../../media/base64.js";
import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
import { findNormalizedProviderValue } from "../model-selection.js";
import { findNormalizedProviderValue, normalizeProviderId } from "../model-selection.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.helpers.js";
@@ -134,6 +134,106 @@ export function coerceImageModelConfig(cfg?: OpenClawConfig): ImageModelConfig {
return coerceToolModelConfig(cfg?.agents?.defaults?.imageModel);
}
/**
 * Render a configured provider/model pair as a fully-qualified ref.
 *
 * If the configured model id is already prefixed with this provider
 * (e.g. "ollama/moondream" under the ollama provider), it is returned
 * unchanged; otherwise the provider prefix is prepended.
 */
function formatConfiguredImageModelRef(provider: string, modelId: string): string {
  const separator = modelId.indexOf("/");
  const hasOwnProviderPrefix =
    separator > 0 && normalizeProviderId(modelId.slice(0, separator)) === provider;
  return hasOwnProviderPrefix ? modelId : `${provider}/${modelId}`;
}
/**
 * Decide whether a configured model id answers a bare (provider-less) ref.
 *
 * A configured id matches either verbatim or case/whitespace-insensitively,
 * and an id carrying its own provider prefix (e.g. "ollama/moondream" under
 * the ollama provider) also matches on its bare remainder.
 */
function modelIdMatchesProviderlessRef(params: {
  provider: string;
  modelId: string;
  ref: string;
}): boolean {
  const { provider, modelId, ref } = params;
  const candidates: string[] = [modelId];
  const separator = modelId.indexOf("/");
  if (separator > 0 && normalizeProviderId(modelId.slice(0, separator)) === provider) {
    // The id is prefixed with this very provider; its bare tail also counts.
    candidates.push(modelId.slice(separator + 1));
  }
  const normalizedRef = normalizeLowercaseStringOrEmpty(ref);
  return candidates.some(
    (candidate) =>
      candidate === ref || normalizeLowercaseStringOrEmpty(candidate) === normalizedRef,
  );
}
/**
 * Collect fully-qualified refs for every configured image-capable model
 * entry (`models.providers.*.models` with `"image"` in `input`) that
 * answers the given bare ref.
 *
 * Returns deduplicated "provider/model" strings; empty when nothing is
 * configured or nothing matches.
 */
function findConfiguredImageModelMatches(params: { cfg?: OpenClawConfig; ref: string }): string[] {
  const providers = params.cfg?.models?.providers;
  if (!providers || typeof providers !== "object") {
    return [];
  }
  const qualified = new Set<string>();
  for (const [rawProviderKey, providerEntry] of Object.entries(providers)) {
    const provider = normalizeProviderId(rawProviderKey);
    const modelEntries = providerEntry?.models;
    if (!provider || !Array.isArray(modelEntries)) {
      continue;
    }
    for (const modelEntry of modelEntries) {
      const modelId = modelEntry?.id?.trim();
      // Only image-capable entries participate in bare-ref qualification.
      const acceptsImages = Array.isArray(modelEntry?.input) && modelEntry.input.includes("image");
      if (!modelId || !acceptsImages) {
        continue;
      }
      if (modelIdMatchesProviderlessRef({ provider, modelId, ref: params.ref })) {
        qualified.add(formatConfiguredImageModelRef(provider, modelId));
      }
    }
  }
  return [...qualified];
}
/**
 * Qualify a bare image-model ref against configured image-capable providers.
 *
 * Empty or already provider-prefixed refs (containing "/") pass through
 * after trimming. A bare ref resolves to the single configured match when
 * exactly one exists, stays as-is when nothing matches (legacy
 * default-provider path), and throws when several configured providers
 * expose the same bare id.
 */
function resolveProviderlessConfiguredImageModelRef(params: {
  cfg?: OpenClawConfig;
  ref: string;
}): string {
  const ref = params.ref.trim();
  const isBare = ref.length > 0 && !ref.includes("/");
  if (!isBare) {
    return ref;
  }
  const matches = findConfiguredImageModelMatches({ cfg: params.cfg, ref });
  switch (matches.length) {
    case 0:
      // No configured match: leave for the legacy default-provider path.
      return ref;
    case 1:
      return matches[0];
    default: {
      const suggestions = matches.map((match) => `"${match}"`).join(" or ");
      throw new Error(
        `Ambiguous image model "${ref}". Configure a provider-prefixed ref such as ${suggestions}.`,
      );
    }
  }
}
/**
 * Resolve every bare ref inside an image-model config (primary plus
 * fallbacks) against uniquely-matching configured image-capable providers.
 *
 * Property presence is preserved: `primary` is emitted only when it was
 * set (trimmed; an empty/whitespace value passes through unresolved as
 * the empty string), `fallbacks` is kept only when at least one non-empty
 * resolved ref remains, and `timeoutMs` is copied verbatim.
 */
export function resolveConfiguredImageModelRefs(params: {
  cfg?: OpenClawConfig;
  imageModelConfig: ImageModelConfig;
}): ImageModelConfig {
  const { cfg, imageModelConfig } = params;
  const trimmedPrimary = imageModelConfig.primary?.trim();
  const resolvedFallbacks = imageModelConfig.fallbacks
    ?.map((ref) => resolveProviderlessConfiguredImageModelRef({ cfg, ref }))
    .filter((ref) => ref.length > 0);
  const primaryPatch =
    imageModelConfig.primary === undefined
      ? {}
      : {
          primary: trimmedPrimary
            ? resolveProviderlessConfiguredImageModelRef({ cfg, ref: trimmedPrimary })
            : trimmedPrimary,
        };
  const fallbackPatch =
    resolvedFallbacks && resolvedFallbacks.length > 0 ? { fallbacks: resolvedFallbacks } : {};
  const timeoutPatch =
    imageModelConfig.timeoutMs === undefined ? {} : { timeoutMs: imageModelConfig.timeoutMs };
  return { ...primaryPatch, ...fallbackPatch, ...timeoutPatch };
}
export function resolveProviderVisionModelFromConfig(params: {
cfg?: OpenClawConfig;
provider: string;

View File

@@ -39,13 +39,13 @@ async function withLiveImageWorkspace<T>(
}
describe.skipIf(!LIVE)("image tool Ollama live", () => {
it("describes a local image through the explicit image tool", async () => {
it("describes a local image through a providerless configured Ollama image model", async () => {
process.env.OLLAMA_API_KEY ||= "ollama-local";
await withLiveImageWorkspace(async ({ agentDir, workspaceDir, imagePath }) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: { primary: `ollama/${OLLAMA_IMAGE_MODEL}` },
imageModel: { primary: OLLAMA_IMAGE_MODEL },
},
},
models: {

View File

@@ -920,6 +920,124 @@ describe("image tool implicit imageModel config", () => {
});
});
it("resolves providerless explicit image models from unique configured image providers", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: {
primary: "moondream",
fallbacks: ["qwen2.5vl:7b", "G-2.5-f"],
},
},
},
models: {
providers: {
ollama: {
baseUrl: "http://localhost:11434",
models: [
makeModelDefinition("moondream", ["text", "image"]),
makeModelDefinition("qwen2.5vl:7b", ["text", "image"]),
makeModelDefinition("G-2.5-f", ["text", "image"]),
],
},
},
},
};
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "ollama/moondream",
fallbacks: ["ollama/qwen2.5vl:7b", "ollama/G-2.5-f"],
});
});
});
it("runs providerless explicit image models on the inferred provider", async () => {
await withTempAgentDir(async (agentDir) => {
const describeImage = vi.fn(async (params: ImageDescriptionRequest) => ({
text: `ok ${params.model}`,
model: params.model,
}));
installImageUnderstandingProviderStubs({
id: "ollama",
capabilities: ["image"],
describeImage,
});
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: { primary: "moondream" },
},
},
models: {
providers: {
ollama: {
baseUrl: "http://localhost:11434",
models: [makeModelDefinition("moondream", ["text", "image"])],
},
},
},
};
const tool = requireImageTool(createImageTool({ config: cfg, agentDir }));
const result = await tool.execute("t1", {
prompt: "Describe this image in one word.",
image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
});
expect(describeImage).toHaveBeenCalledWith(
expect.objectContaining({ provider: "ollama", model: "moondream" }),
);
expect(result.content).toEqual(
expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok moondream" })]),
);
});
});
it("rejects ambiguous providerless explicit image models", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: { primary: "moondream" },
},
},
models: {
providers: {
ollama: {
baseUrl: "http://localhost:11434",
models: [makeModelDefinition("moondream", ["text", "image"])],
},
lmstudio: {
baseUrl: "http://localhost:1234",
models: [makeModelDefinition("moondream", ["text", "image"])],
},
},
},
};
expect(() => resolveImageModelConfigForTool({ cfg, agentDir })).toThrow(
'Ambiguous image model "moondream"',
);
});
});
it("keeps unmatched providerless explicit image models on the legacy default-provider path", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: { primary: "gpt-5.4-mini" },
},
},
};
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "gpt-5.4-mini",
});
});
});
it("keeps image tool available when primary model supports images (for explicit requests)", async () => {
// When the primary model supports images, we still keep the tool available
// because images are auto-injected into prompts. The tool description is

View File

@@ -30,6 +30,7 @@ import {
decodeDataUrl,
hasImageReasoningOnlyResponse,
type ImageModelConfig,
resolveConfiguredImageModelRefs,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import {
@@ -123,7 +124,10 @@ export function resolveImageModelConfigForTool(params: {
// The tool description is adjusted via modelHasVision to discourage redundant usage.
const explicit = coerceImageModelConfig(params.cfg);
if (hasToolModelConfig(explicit)) {
return explicit;
return resolveConfiguredImageModelRefs({
cfg: params.cfg,
imageModelConfig: explicit,
});
}
const primary = resolveDefaultModelRef(params.cfg);

View File

@@ -7,6 +7,7 @@ import {
import {
coerceImageModelConfig,
type ImageModelConfig,
resolveConfiguredImageModelRefs,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
@@ -42,12 +43,18 @@ export function resolvePdfModelConfigForTool(params: {
}): ImageModelConfig | null {
const explicitPdf = coercePdfModelConfig(params.cfg);
if (explicitPdf.primary?.trim() || (explicitPdf.fallbacks?.length ?? 0) > 0) {
return explicitPdf;
return resolveConfiguredImageModelRefs({
cfg: params.cfg,
imageModelConfig: explicitPdf,
});
}
const explicitImage = coerceImageModelConfig(params.cfg);
if (explicitImage.primary?.trim() || (explicitImage.fallbacks?.length ?? 0) > 0) {
return explicitImage;
return resolveConfiguredImageModelRefs({
cfg: params.cfg,
imageModelConfig: explicitImage,
});
}
const primary = resolveDefaultModelRef(params.cfg);