fix: resolve providerless image model refs

This commit is contained in:
Peter Steinberger
2026-04-28 10:18:00 +01:00
parent 5741e40c14
commit fac116cfa4
10 changed files with 241 additions and 7 deletions

View File

@@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai
- Providers/DeepSeek: backfill DeepSeek V4 `reasoning_content` on plain assistant replay messages as well as tool-call turns, so thinking sessions with prior tool use no longer fail follow-up requests with missing reasoning content. Fixes #73417; refs #71372. Thanks @34262315716 and @Bartok9.
- Auto-reply: preserve voice-note media from silent turns while continuing to suppress text and non-voice media, so `NO_REPLY` TTS replies still deliver the requested audio bubble. (#73406) Thanks @zqchris.
- Channels/Mattermost: stop enqueueing regular inbound posts as system events, so Mattermost user messages reach the model only as user-role inbound-envelope content instead of also appearing as `System: Mattermost message...` directives. Fixes #71795. Thanks @juan-flores077.
- Agents/media: qualify bare `agents.defaults.imageModel` and `pdfModel` refs from unique configured image-capable providers, so Ollama vision models such as `moondream` and `qwen2.5vl:7b` do not fall through to the default provider. Fixes #38816; supersedes #73396. Thanks @alainasclaw and @vincentkoc.
- Agents/Anthropic: send implicit Anthropic beta headers only to direct public Anthropic endpoints, including OAuth, so custom Anthropic-compatible providers no longer mis-handle unsupported beta flags unless explicitly configured. Refs #73346. Thanks @byBrodowski.
- Skills: require explicit `skills.entries.coding-agent.enabled` before exposing the bundled coding-agent skill, so installs with Codex on PATH but no OpenAI auth do not silently offer Codex delegation. Fixes #73358. Thanks @LaFleurAdvertising and @Sanjays2402.
- Agents/subagents: preserve `sessions_yield` as a paused subagent state and ignore its wait text while freezing completion output, so parent sessions wait for the final post-compaction answer instead of receiving intermediate progress or `(no output)`. Fixes #73413. Thanks @Ask-sola.

View File

@@ -342,6 +342,7 @@ Time format in system prompt. Default: `auto` (OS preference).
- `imageModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
- Used by the `image` tool path as its vision-model config.
- Also used as fallback routing when the selected/default model cannot accept image input.
- Prefer explicit `provider/model` refs. Bare IDs are accepted for compatibility; if a bare ID uniquely matches a configured image-capable entry in `models.providers.*.models`, OpenClaw qualifies it to that provider. Ambiguous configured matches require an explicit provider prefix.
- `imageGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
- Used by the shared image-generation capability and any future tool/plugin surface that generates images.
- Typical values: `google/gemini-3.1-flash-image-preview` for native Gemini image generation, `fal/fal-ai/flux/dev` for fal, `openai/gpt-image-2` for OpenAI Images, or `openai/gpt-image-1.5` for transparent-background OpenAI PNG/WebP output.

View File

@@ -172,6 +172,7 @@ If `tools.media.<capability>.enabled` is **not** set to `false` and you haven't
</Step>
<Step title="agents.defaults.imageModel">
`agents.defaults.imageModel` primary/fallback refs (image only).
Prefer `provider/model` refs. Bare refs are qualified from configured image-capable provider model entries only when the match is unique.
</Step>
<Step title="Local CLIs (audio only)">
Local CLIs (if installed):

View File

@@ -283,6 +283,8 @@ To make Ollama the default image-understanding model for inbound media, configur
}
```
Prefer the full `ollama/<model>` ref. If the same model is listed under `models.providers.ollama.models` with `input: ["text", "image"]` and no other configured image provider exposes that bare model ID, OpenClaw also normalizes a bare `imageModel` ref such as `qwen2.5vl:7b` to `ollama/qwen2.5vl:7b`. If more than one configured image provider has the same bare ID, use the provider prefix explicitly.
Slow local vision models may need a longer image-understanding timeout than cloud models. They can also crash or stop when Ollama tries to allocate the full advertised vision context on constrained hardware. Set a capability timeout, and cap `num_ctx` on the model entry when you only need a normal image-description turn:
```json5

View File

@@ -543,8 +543,8 @@ describe("CLI attempt execution", () => {
await runAgentAttempt({
providerOverride: "anthropic",
originalProvider: "anthropic",
modelOverride: "claude-opus-4-7",
originalProvider: "anthropic",
cfg: {
agents: {
defaults: {

View File

@@ -2,7 +2,7 @@ import type { AssistantMessage } from "@mariozechner/pi-ai";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { estimateBase64DecodedBytes } from "../../media/base64.js";
import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
import { findNormalizedProviderValue } from "../model-selection.js";
import { findNormalizedProviderValue, normalizeProviderId } from "../model-selection.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
import { coerceToolModelConfig, type ToolModelConfig } from "./model-config.helpers.js";
@@ -134,6 +134,106 @@ export function coerceImageModelConfig(cfg?: OpenClawConfig): ImageModelConfig {
return coerceToolModelConfig(cfg?.agents?.defaults?.imageModel);
}
/**
 * Render a configured provider/model pair as a fully-qualified ref.
 *
 * If the configured model id is already prefixed with this provider
 * (e.g. "ollama/moondream" under the ollama provider), it is returned
 * unchanged; otherwise the provider prefix is prepended.
 */
function formatConfiguredImageModelRef(provider: string, modelId: string): string {
  const separator = modelId.indexOf("/");
  const hasOwnProviderPrefix =
    separator > 0 && normalizeProviderId(modelId.slice(0, separator)) === provider;
  return hasOwnProviderPrefix ? modelId : `${provider}/${modelId}`;
}
/**
 * Decide whether a configured model id answers a bare (provider-less) ref.
 *
 * A configured id matches either verbatim or case/whitespace-insensitively,
 * and an id carrying its own provider prefix (e.g. "ollama/moondream" under
 * the ollama provider) also matches on its bare remainder.
 */
function modelIdMatchesProviderlessRef(params: {
  provider: string;
  modelId: string;
  ref: string;
}): boolean {
  const { provider, modelId, ref } = params;
  const candidates: string[] = [modelId];
  const separator = modelId.indexOf("/");
  if (separator > 0 && normalizeProviderId(modelId.slice(0, separator)) === provider) {
    // The id is prefixed with this very provider; its bare tail also counts.
    candidates.push(modelId.slice(separator + 1));
  }
  const normalizedRef = normalizeLowercaseStringOrEmpty(ref);
  return candidates.some(
    (candidate) =>
      candidate === ref || normalizeLowercaseStringOrEmpty(candidate) === normalizedRef,
  );
}
/**
 * Collect fully-qualified refs for every configured image-capable model
 * entry (`models.providers.*.models` with `"image"` in `input`) that
 * answers the given bare ref.
 *
 * Returns deduplicated "provider/model" strings; empty when nothing is
 * configured or nothing matches.
 */
function findConfiguredImageModelMatches(params: { cfg?: OpenClawConfig; ref: string }): string[] {
  const providers = params.cfg?.models?.providers;
  if (!providers || typeof providers !== "object") {
    return [];
  }
  const qualified = new Set<string>();
  for (const [rawProviderKey, providerEntry] of Object.entries(providers)) {
    const provider = normalizeProviderId(rawProviderKey);
    const modelEntries = providerEntry?.models;
    if (!provider || !Array.isArray(modelEntries)) {
      continue;
    }
    for (const modelEntry of modelEntries) {
      const modelId = modelEntry?.id?.trim();
      // Only image-capable entries participate in bare-ref qualification.
      const acceptsImages = Array.isArray(modelEntry?.input) && modelEntry.input.includes("image");
      if (!modelId || !acceptsImages) {
        continue;
      }
      if (modelIdMatchesProviderlessRef({ provider, modelId, ref: params.ref })) {
        qualified.add(formatConfiguredImageModelRef(provider, modelId));
      }
    }
  }
  return [...qualified];
}
/**
 * Qualify a bare image-model ref against configured image-capable providers.
 *
 * Empty or already provider-prefixed refs (containing "/") pass through
 * after trimming. A bare ref resolves to the single configured match when
 * exactly one exists, stays as-is when nothing matches (legacy
 * default-provider path), and throws when several configured providers
 * expose the same bare id.
 */
function resolveProviderlessConfiguredImageModelRef(params: {
  cfg?: OpenClawConfig;
  ref: string;
}): string {
  const ref = params.ref.trim();
  const isBare = ref.length > 0 && !ref.includes("/");
  if (!isBare) {
    return ref;
  }
  const matches = findConfiguredImageModelMatches({ cfg: params.cfg, ref });
  switch (matches.length) {
    case 0:
      // No configured match: leave for the legacy default-provider path.
      return ref;
    case 1:
      return matches[0];
    default: {
      const suggestions = matches.map((match) => `"${match}"`).join(" or ");
      throw new Error(
        `Ambiguous image model "${ref}". Configure a provider-prefixed ref such as ${suggestions}.`,
      );
    }
  }
}
/**
 * Resolve every bare ref inside an image-model config (primary plus
 * fallbacks) against uniquely-matching configured image-capable providers.
 *
 * Property presence is preserved: `primary` is emitted only when it was
 * set (trimmed; an empty/whitespace value passes through unresolved as
 * the empty string), `fallbacks` is kept only when at least one non-empty
 * resolved ref remains, and `timeoutMs` is copied verbatim.
 */
export function resolveConfiguredImageModelRefs(params: {
  cfg?: OpenClawConfig;
  imageModelConfig: ImageModelConfig;
}): ImageModelConfig {
  const { cfg, imageModelConfig } = params;
  const trimmedPrimary = imageModelConfig.primary?.trim();
  const resolvedFallbacks = imageModelConfig.fallbacks
    ?.map((ref) => resolveProviderlessConfiguredImageModelRef({ cfg, ref }))
    .filter((ref) => ref.length > 0);
  const primaryPatch =
    imageModelConfig.primary === undefined
      ? {}
      : {
          primary: trimmedPrimary
            ? resolveProviderlessConfiguredImageModelRef({ cfg, ref: trimmedPrimary })
            : trimmedPrimary,
        };
  const fallbackPatch =
    resolvedFallbacks && resolvedFallbacks.length > 0 ? { fallbacks: resolvedFallbacks } : {};
  const timeoutPatch =
    imageModelConfig.timeoutMs === undefined ? {} : { timeoutMs: imageModelConfig.timeoutMs };
  return { ...primaryPatch, ...fallbackPatch, ...timeoutPatch };
}
export function resolveProviderVisionModelFromConfig(params: {
cfg?: OpenClawConfig;
provider: string;

View File

@@ -39,13 +39,13 @@ async function withLiveImageWorkspace<T>(
}
describe.skipIf(!LIVE)("image tool Ollama live", () => {
it("describes a local image through the explicit image tool", async () => {
it("describes a local image through a providerless configured Ollama image model", async () => {
process.env.OLLAMA_API_KEY ||= "ollama-local";
await withLiveImageWorkspace(async ({ agentDir, workspaceDir, imagePath }) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: { primary: `ollama/${OLLAMA_IMAGE_MODEL}` },
imageModel: { primary: OLLAMA_IMAGE_MODEL },
},
},
models: {

View File

@@ -920,6 +920,124 @@ describe("image tool implicit imageModel config", () => {
});
});
it("resolves providerless explicit image models from unique configured image providers", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: {
primary: "moondream",
fallbacks: ["qwen2.5vl:7b", "G-2.5-f"],
},
},
},
models: {
providers: {
ollama: {
baseUrl: "http://localhost:11434",
models: [
makeModelDefinition("moondream", ["text", "image"]),
makeModelDefinition("qwen2.5vl:7b", ["text", "image"]),
makeModelDefinition("G-2.5-f", ["text", "image"]),
],
},
},
},
};
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "ollama/moondream",
fallbacks: ["ollama/qwen2.5vl:7b", "ollama/G-2.5-f"],
});
});
});
it("runs providerless explicit image models on the inferred provider", async () => {
await withTempAgentDir(async (agentDir) => {
const describeImage = vi.fn(async (params: ImageDescriptionRequest) => ({
text: `ok ${params.model}`,
model: params.model,
}));
installImageUnderstandingProviderStubs({
id: "ollama",
capabilities: ["image"],
describeImage,
});
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: { primary: "moondream" },
},
},
models: {
providers: {
ollama: {
baseUrl: "http://localhost:11434",
models: [makeModelDefinition("moondream", ["text", "image"])],
},
},
},
};
const tool = requireImageTool(createImageTool({ config: cfg, agentDir }));
const result = await tool.execute("t1", {
prompt: "Describe this image in one word.",
image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`,
});
expect(describeImage).toHaveBeenCalledWith(
expect.objectContaining({ provider: "ollama", model: "moondream" }),
);
expect(result.content).toEqual(
expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok moondream" })]),
);
});
});
it("rejects ambiguous providerless explicit image models", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: { primary: "moondream" },
},
},
models: {
providers: {
ollama: {
baseUrl: "http://localhost:11434",
models: [makeModelDefinition("moondream", ["text", "image"])],
},
lmstudio: {
baseUrl: "http://localhost:1234",
models: [makeModelDefinition("moondream", ["text", "image"])],
},
},
},
};
expect(() => resolveImageModelConfigForTool({ cfg, agentDir })).toThrow(
'Ambiguous image model "moondream"',
);
});
});
it("keeps unmatched providerless explicit image models on the legacy default-provider path", async () => {
await withTempAgentDir(async (agentDir) => {
const cfg: OpenClawConfig = {
agents: {
defaults: {
imageModel: { primary: "gpt-5.4-mini" },
},
},
};
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "gpt-5.4-mini",
});
});
});
it("keeps image tool available when primary model supports images (for explicit requests)", async () => {
// When the primary model supports images, we still keep the tool available
// because images are auto-injected into prompts. The tool description is

View File

@@ -30,6 +30,7 @@ import {
decodeDataUrl,
hasImageReasoningOnlyResponse,
type ImageModelConfig,
resolveConfiguredImageModelRefs,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import {
@@ -123,7 +124,10 @@ export function resolveImageModelConfigForTool(params: {
// The tool description is adjusted via modelHasVision to discourage redundant usage.
const explicit = coerceImageModelConfig(params.cfg);
if (hasToolModelConfig(explicit)) {
return explicit;
return resolveConfiguredImageModelRefs({
cfg: params.cfg,
imageModelConfig: explicit,
});
}
const primary = resolveDefaultModelRef(params.cfg);

View File

@@ -7,6 +7,7 @@ import {
import {
coerceImageModelConfig,
type ImageModelConfig,
resolveConfiguredImageModelRefs,
resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import { hasAuthForProvider, resolveDefaultModelRef } from "./model-config.helpers.js";
@@ -42,12 +43,18 @@ export function resolvePdfModelConfigForTool(params: {
}): ImageModelConfig | null {
const explicitPdf = coercePdfModelConfig(params.cfg);
if (explicitPdf.primary?.trim() || (explicitPdf.fallbacks?.length ?? 0) > 0) {
return explicitPdf;
return resolveConfiguredImageModelRefs({
cfg: params.cfg,
imageModelConfig: explicitPdf,
});
}
const explicitImage = coerceImageModelConfig(params.cfg);
if (explicitImage.primary?.trim() || (explicitImage.fallbacks?.length ?? 0) > 0) {
return explicitImage;
return resolveConfiguredImageModelRefs({
cfg: params.cfg,
imageModelConfig: explicitImage,
});
}
const primary = resolveDefaultModelRef(params.cfg);