mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 08:40:44 +00:00
fix: resolve providerless image model refs
This commit is contained in:
@@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Providers/DeepSeek: backfill DeepSeek V4 `reasoning_content` on plain assistant replay messages as well as tool-call turns, so thinking sessions with prior tool use no longer fail follow-up requests with missing reasoning content. Fixes #73417; refs #71372. Thanks @34262315716 and @Bartok9.
|
||||
- Auto-reply: preserve voice-note media from silent turns while continuing to suppress text and non-voice media, so `NO_REPLY` TTS replies still deliver the requested audio bubble. (#73406) Thanks @zqchris.
|
||||
- Channels/Mattermost: stop enqueueing regular inbound posts as system events, so Mattermost user messages reach the model only as user-role inbound-envelope content instead of also appearing as `System: Mattermost message...` directives. Fixes #71795. Thanks @juan-flores077.
|
||||
- Agents/media: qualify bare `agents.defaults.imageModel` and `pdfModel` refs from unique configured image-capable providers, so Ollama vision models such as `moondream` and `qwen2.5vl:7b` do not fall through to the default provider. Fixes #38816; supersedes #73396. Thanks @alainasclaw and @vincentkoc.
|
||||
- Agents/Anthropic: send implicit Anthropic beta headers only to direct public Anthropic endpoints, including OAuth, so custom Anthropic-compatible providers no longer mis-handle unsupported beta flags unless explicitly configured. Refs #73346. Thanks @byBrodowski.
|
||||
- Skills: require explicit `skills.entries.coding-agent.enabled` before exposing the bundled coding-agent skill, so installs with Codex on PATH but no OpenAI auth do not silently offer Codex delegation. Fixes #73358. Thanks @LaFleurAdvertising and @Sanjays2402.
|
||||
- Agents/subagents: preserve `sessions_yield` as a paused subagent state and ignore its wait text while freezing completion output, so parent sessions wait for the final post-compaction answer instead of receiving intermediate progress or `(no output)`. Fixes #73413. Thanks @Ask-sola.
|
||||
|
||||
@@ -342,6 +342,7 @@ Time format in system prompt. Default: `auto` (OS preference).
|
||||
- `imageModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
|
||||
- Used by the `image` tool path as its vision-model config.
|
||||
- Also used as fallback routing when the selected/default model cannot accept image input.
|
||||
- Prefer explicit `provider/model` refs. Bare IDs are accepted for compatibility; if a bare ID uniquely matches a configured image-capable entry in `models.providers.*.models`, OpenClaw qualifies it to that provider. Ambiguous configured matches require an explicit provider prefix.
|
||||
- `imageGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
|
||||
- Used by the shared image-generation capability and any future tool/plugin surface that generates images.
|
||||
- Typical values: `google/gemini-3.1-flash-image-preview` for native Gemini image generation, `fal/fal-ai/flux/dev` for fal, `openai/gpt-image-2` for OpenAI Images, or `openai/gpt-image-1.5` for transparent-background OpenAI PNG/WebP output.
|
||||
|
||||
@@ -172,6 +172,7 @@ If `tools.media.<capability>.enabled` is **not** set to `false` and you haven't
|
||||
</Step>
|
||||
<Step title="agents.defaults.imageModel">
|
||||
`agents.defaults.imageModel` primary/fallback refs (image only).
|
||||
Prefer `provider/model` refs. Bare refs are qualified from configured image-capable provider model entries only when the match is unique.
|
||||
</Step>
|
||||
<Step title="Local CLIs (audio only)">
|
||||
Local CLIs (if installed):
|
||||
|
||||
@@ -283,6 +283,8 @@ To make Ollama the default image-understanding model for inbound media, configur
|
||||
}
|
||||
```
|
||||
|
||||
Prefer the full `ollama/<model>` ref. If the same model is listed under `models.providers.ollama.models` with `input: ["text", "image"]` and no other configured image provider exposes that bare model ID, OpenClaw also normalizes a bare `imageModel` ref such as `qwen2.5vl:7b` to `ollama/qwen2.5vl:7b`. If more than one configured image provider has the same bare ID, use the provider prefix explicitly.
|
||||
|
||||
Slow local vision models may need a longer image-understanding timeout than cloud models. They can also crash or stop when Ollama tries to allocate the full advertised vision context on constrained hardware. Set a capability timeout, and cap `num_ctx` on the model entry when you only need a normal image-description turn:
|
||||
|
||||
```json5
|
||||
|
||||
@@ -543,8 +543,8 @@ describe("CLI attempt execution", () => {
|
||||
|
||||
await runAgentAttempt({
|
||||
providerOverride: "anthropic",
|
||||
originalProvider: "anthropic",
|
||||
modelOverride: "claude-opus-4-7",
|
||||
originalProvider: "anthropic",
|
||||
cfg: {
|
||||
agents: {
|
||||
defaults: {
|
||||
|
||||
@@ -2,7 +2,7 @@ import type { AssistantMessage } from "@mariozechner/pi-ai";
|
||||
import type { OpenClawConfig } from "../../config/types.openclaw.js";
|
||||
import { estimateBase64DecodedBytes } from "../../media/base64.js";
|
||||
import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
|
||||
import { findNormalizedProviderValue } from "../model-selection.js";
|
||||
import { findNormalizedProviderValue, normalizeProviderId } from "../model-selection.js";
|
||||
import { extractAssistantText } from "../pi-embedded-utils.js";
|
||||
import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.helpers.js";
|
||||
|
||||
@@ -134,6 +134,106 @@ export function coerceImageModelConfig(cfg?: OpenClawConfig): ImageModelConfig {
|
||||
return coerceToolModelConfig(cfg?.agents?.defaults?.imageModel);
|
||||
}
|
||||
|
||||
/**
 * Builds the `provider/model` ref for a configured model entry.
 *
 * When the model id already carries a `prefix/` whose normalized form equals
 * `provider`, the id is returned untouched so the provider is not duplicated.
 *
 * @param provider - Provider id, already normalized by the caller.
 * @param modelId - Model id as written in the provider's model list.
 * @returns The provider-qualified model ref.
 */
function formatConfiguredImageModelRef(provider: string, modelId: string): string {
  const separatorIndex = modelId.indexOf("/");
  // `> 0` (not `>= 0`): a leading slash means there is no prefix to compare.
  const alreadyQualified =
    separatorIndex > 0 && normalizeProviderId(modelId.slice(0, separatorIndex)) === provider;
  return alreadyQualified ? modelId : `${provider}/${modelId}`;
}
|
||||
|
||||
/**
 * Tells whether a configured model id corresponds to a bare (providerless) ref.
 *
 * The ref is compared against the full model id and, when the id is prefixed
 * with its own provider (`provider/rest`), against the part after the slash.
 * A candidate matches on strict equality or when both sides are equal after
 * `normalizeLowercaseStringOrEmpty`.
 *
 * @param params.provider - Normalized provider id owning the model entry.
 * @param params.modelId - Model id from the provider's model list.
 * @param params.ref - Bare ref taken from the image/pdf model config.
 * @returns `true` when any accepted spelling of the model id matches the ref.
 */
function modelIdMatchesProviderlessRef(params: {
  provider: string;
  modelId: string;
  ref: string;
}): boolean {
  const { provider, modelId, ref } = params;

  const acceptedIds = [modelId];
  const separatorIndex = modelId.indexOf("/");
  if (separatorIndex > 0 && normalizeProviderId(modelId.slice(0, separatorIndex)) === provider) {
    // The suffix is strictly shorter than the full id, so it can never duplicate it.
    acceptedIds.push(modelId.slice(separatorIndex + 1));
  }

  const refKey = normalizeLowercaseStringOrEmpty(ref);
  return acceptedIds.some(
    (candidate) => candidate === ref || normalizeLowercaseStringOrEmpty(candidate) === refKey,
  );
}
|
||||
|
||||
/**
 * Collects the provider-qualified refs of every configured model that accepts
 * image input and matches the given bare ref.
 *
 * Walks `cfg.models.providers`; a model entry counts only when it has a
 * non-empty trimmed `id` and its `input` array includes `"image"`.
 * Results are de-duplicated.
 *
 * @param params.cfg - Config to search; a missing provider map yields `[]`.
 * @param params.ref - Bare model ref to look up.
 * @returns The distinct qualified refs, in discovery order.
 */
function findConfiguredImageModelMatches(params: { cfg?: OpenClawConfig; ref: string }): string[] {
  const providerMap = params.cfg?.models?.providers;
  if (!providerMap || typeof providerMap !== "object") {
    return [];
  }

  const qualifiedRefs = new Set<string>();
  for (const [rawProviderKey, providerEntry] of Object.entries(providerMap)) {
    const provider = normalizeProviderId(rawProviderKey);
    const configuredModels = providerEntry?.models;
    if (!provider || !Array.isArray(configuredModels)) {
      continue;
    }

    for (const model of configuredModels) {
      const modelId = model?.id?.trim();
      const inputKinds = model?.input;
      const acceptsImages = Array.isArray(inputKinds) && inputKinds.includes("image");
      if (!modelId || !acceptsImages) {
        continue;
      }
      if (modelIdMatchesProviderlessRef({ provider, modelId, ref: params.ref })) {
        qualifiedRefs.add(formatConfiguredImageModelRef(provider, modelId));
      }
    }
  }
  return [...qualifiedRefs];
}
|
||||
|
||||
/**
 * Qualifies a bare image-model ref with its provider when exactly one
 * configured image-capable model matches it.
 *
 * - Empty refs and refs that already contain `/` are returned trimmed, as-is.
 * - No configured match: the trimmed ref is returned unchanged.
 * - One match: that provider-qualified ref is returned.
 *
 * @param params.cfg - Config whose provider model lists are searched.
 * @param params.ref - Ref as written in the config.
 * @returns The trimmed ref, qualified when a unique match exists.
 * @throws Error when more than one configured model matches the bare ref.
 */
function resolveProviderlessConfiguredImageModelRef(params: {
  cfg?: OpenClawConfig;
  ref: string;
}): string {
  const trimmedRef = params.ref.trim();
  const isBareRef = trimmedRef.length > 0 && !trimmedRef.includes("/");
  if (!isBareRef) {
    return trimmedRef;
  }

  const candidates = findConfiguredImageModelMatches({ cfg: params.cfg, ref: trimmedRef });
  if (candidates.length > 1) {
    const suggestions = candidates.map((candidate) => `"${candidate}"`).join(" or ");
    throw new Error(
      `Ambiguous image model "${trimmedRef}". Configure a provider-prefixed ref such as ${suggestions}.`,
    );
  }
  return candidates.length === 1 ? candidates[0] : trimmedRef;
}
|
||||
|
||||
/**
 * Returns a copy of an image-model config in which bare `primary` and
 * `fallbacks` refs have been qualified against the configured providers.
 *
 * Only keys that carry a value are emitted: `primary` when the input defined
 * it, `fallbacks` when at least one non-empty ref remains, and `timeoutMs`
 * when the input defined it.
 *
 * @param params.cfg - Config whose provider model lists drive qualification.
 * @param params.imageModelConfig - The explicit model config to resolve.
 * @returns The resolved config.
 * @throws Error when any bare ref matches more than one configured model.
 */
export function resolveConfiguredImageModelRefs(params: {
  cfg?: OpenClawConfig;
  imageModelConfig: ImageModelConfig;
}): ImageModelConfig {
  const { cfg, imageModelConfig } = params;
  const qualify = (ref: string): string =>
    resolveProviderlessConfiguredImageModelRef({ cfg, ref });

  const trimmedPrimary = imageModelConfig.primary?.trim();

  // Fallbacks are resolved before the primary, so an ambiguous fallback is reported first.
  const qualifiedFallbacks = imageModelConfig.fallbacks
    ?.map((ref) => qualify(ref))
    .filter((ref) => ref.length > 0);

  const primaryPart =
    imageModelConfig.primary !== undefined
      ? { primary: trimmedPrimary ? qualify(trimmedPrimary) : trimmedPrimary }
      : {};
  const fallbacksPart =
    qualifiedFallbacks && qualifiedFallbacks.length > 0 ? { fallbacks: qualifiedFallbacks } : {};
  const timeoutPart =
    imageModelConfig.timeoutMs !== undefined ? { timeoutMs: imageModelConfig.timeoutMs } : {};

  return { ...primaryPart, ...fallbacksPart, ...timeoutPart };
}
|
||||
|
||||
export function resolveProviderVisionModelFromConfig(params: {
|
||||
cfg?: OpenClawConfig;
|
||||
provider: string;
|
||||
|
||||
@@ -39,13 +39,13 @@ async function withLiveImageWorkspace<T>(
|
||||
}
|
||||
|
||||
describe.skipIf(!LIVE)("image tool Ollama live", () => {
|
||||
it("describes a local image through the explicit image tool", async () => {
|
||||
it("describes a local image through a providerless configured Ollama image model", async () => {
|
||||
process.env.OLLAMA_API_KEY ||= "ollama-local";
|
||||
await withLiveImageWorkspace(async ({ agentDir, workspaceDir, imagePath }) => {
|
||||
const cfg: OpenClawConfig = {
|
||||
agents: {
|
||||
defaults: {
|
||||
imageModel: { primary: `ollama/${OLLAMA_IMAGE_MODEL}` },
|
||||
imageModel: { primary: OLLAMA_IMAGE_MODEL },
|
||||
},
|
||||
},
|
||||
models: {
|
||||
|
||||
@@ -920,6 +920,124 @@ describe("image tool implicit imageModel config", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("resolves providerless explicit image models from unique configured image providers", async () => {
  await withTempAgentDir(async (agentDir) => {
    // Each bare ref is listed once, under the only image-capable provider (ollama).
    const ollamaVisionModels = [
      makeModelDefinition("moondream", ["text", "image"]),
      makeModelDefinition("qwen2.5vl:7b", ["text", "image"]),
      makeModelDefinition("G-2.5-f", ["text", "image"]),
    ];
    const config: OpenClawConfig = {
      agents: {
        defaults: {
          imageModel: {
            primary: "moondream",
            fallbacks: ["qwen2.5vl:7b", "G-2.5-f"],
          },
        },
      },
      models: {
        providers: {
          ollama: {
            baseUrl: "http://localhost:11434",
            models: ollamaVisionModels,
          },
        },
      },
    };

    const resolved = resolveImageModelConfigForTool({ cfg: config, agentDir });

    // Primary and every fallback gain the inferred `ollama/` prefix; model id casing is kept.
    expect(resolved).toEqual({
      primary: "ollama/moondream",
      fallbacks: ["ollama/qwen2.5vl:7b", "ollama/G-2.5-f"],
    });
  });
});
|
||||
|
||||
it("runs providerless explicit image models on the inferred provider", async () => {
  await withTempAgentDir(async (agentDir) => {
    // Stub provider that echoes the model it was asked to run.
    const describeImage = vi.fn(async (request: ImageDescriptionRequest) => {
      return { text: `ok ${request.model}`, model: request.model };
    });
    installImageUnderstandingProviderStubs({
      id: "ollama",
      capabilities: ["image"],
      describeImage,
    });

    const config: OpenClawConfig = {
      agents: {
        defaults: {
          // Bare ref: no provider prefix.
          imageModel: { primary: "moondream" },
        },
      },
      models: {
        providers: {
          ollama: {
            baseUrl: "http://localhost:11434",
            models: [makeModelDefinition("moondream", ["text", "image"])],
          },
        },
      },
    };

    const imageTool = requireImageTool(createImageTool({ config, agentDir }));
    const outcome = await imageTool.execute("t1", {
      prompt: "Describe this image in one word.",
      image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
    });

    // The request must reach the ollama stub with the unprefixed model id.
    expect(describeImage).toHaveBeenCalledWith(
      expect.objectContaining({ provider: "ollama", model: "moondream" }),
    );
    expect(outcome.content).toEqual(
      expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok moondream" })]),
    );
  });
});
|
||||
|
||||
it("rejects ambiguous providerless explicit image models", async () => {
  await withTempAgentDir(async (agentDir) => {
    // Two providers expose the same bare image-capable model id.
    const ollamaProvider = {
      baseUrl: "http://localhost:11434",
      models: [makeModelDefinition("moondream", ["text", "image"])],
    };
    const lmstudioProvider = {
      baseUrl: "http://localhost:1234",
      models: [makeModelDefinition("moondream", ["text", "image"])],
    };
    const config: OpenClawConfig = {
      agents: {
        defaults: {
          imageModel: { primary: "moondream" },
        },
      },
      models: {
        providers: {
          ollama: ollamaProvider,
          lmstudio: lmstudioProvider,
        },
      },
    };

    const resolve = () => resolveImageModelConfigForTool({ cfg: config, agentDir });

    expect(resolve).toThrow('Ambiguous image model "moondream"');
  });
});
|
||||
|
||||
it("keeps unmatched providerless explicit image models on the legacy default-provider path", async () => {
  await withTempAgentDir(async (agentDir) => {
    // No `models.providers` section, so nothing can qualify the bare ref.
    const config: OpenClawConfig = {
      agents: { defaults: { imageModel: { primary: "gpt-5.4-mini" } } },
    };

    const resolved = resolveImageModelConfigForTool({ cfg: config, agentDir });

    // The ref is passed through untouched.
    expect(resolved).toEqual({ primary: "gpt-5.4-mini" });
  });
});
|
||||
|
||||
it("keeps image tool available when primary model supports images (for explicit requests)", async () => {
|
||||
// When the primary model supports images, we still keep the tool available
|
||||
// because images are auto-injected into prompts. The tool description is
|
||||
|
||||
@@ -30,6 +30,7 @@ import {
|
||||
decodeDataUrl,
|
||||
hasImageReasoningOnlyResponse,
|
||||
type ImageModelConfig,
|
||||
resolveConfiguredImageModelRefs,
|
||||
resolveProviderVisionModelFromConfig,
|
||||
} from "./image-tool.helpers.js";
|
||||
import {
|
||||
@@ -123,7 +124,10 @@ export function resolveImageModelConfigForTool(params: {
|
||||
// The tool description is adjusted via modelHasVision to discourage redundant usage.
|
||||
const explicit = coerceImageModelConfig(params.cfg);
|
||||
if (hasToolModelConfig(explicit)) {
|
||||
return explicit;
|
||||
return resolveConfiguredImageModelRefs({
|
||||
cfg: params.cfg,
|
||||
imageModelConfig: explicit,
|
||||
});
|
||||
}
|
||||
|
||||
const primary = resolveDefaultModelRef(params.cfg);
|
||||
|
||||
@@ -7,6 +7,7 @@ import {
|
||||
import {
|
||||
coerceImageModelConfig,
|
||||
type ImageModelConfig,
|
||||
resolveConfiguredImageModelRefs,
|
||||
resolveProviderVisionModelFromConfig,
|
||||
} from "./image-tool.helpers.js";
|
||||
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
|
||||
@@ -42,12 +43,18 @@ export function resolvePdfModelConfigForTool(params: {
|
||||
}): ImageModelConfig | null {
|
||||
const explicitPdf = coercePdfModelConfig(params.cfg);
|
||||
if (explicitPdf.primary?.trim() || (explicitPdf.fallbacks?.length ?? 0) > 0) {
|
||||
return explicitPdf;
|
||||
return resolveConfiguredImageModelRefs({
|
||||
cfg: params.cfg,
|
||||
imageModelConfig: explicitPdf,
|
||||
});
|
||||
}
|
||||
|
||||
const explicitImage = coerceImageModelConfig(params.cfg);
|
||||
if (explicitImage.primary?.trim() || (explicitImage.fallbacks?.length ?? 0) > 0) {
|
||||
return explicitImage;
|
||||
return resolveConfiguredImageModelRefs({
|
||||
cfg: params.cfg,
|
||||
imageModelConfig: explicitImage,
|
||||
});
|
||||
}
|
||||
|
||||
const primary = resolveDefaultModelRef(params.cfg);
|
||||
|
||||
Reference in New Issue
Block a user