fix: honor explicit media image model routing

This commit is contained in:
Peter Steinberger
2026-04-24 02:21:25 +01:00
parent c0a7b6a510
commit d268c850e6
3 changed files with 183 additions and 13 deletions

View File

@@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Media understanding: honor explicit image-model configuration before native-vision skips, including `agents.defaults.imageModel`, `tools.media.image.models`, and provider image defaults such as MiniMax VL when the active chat model is text-only. Fixes #47614, #63722, #69171.
- Codex/media understanding: support `codex/*` image models through bounded Codex app-server image turns, while keeping `openai-codex/*` on the OpenAI Codex OAuth route and validating app-server responses against generated protocol contracts. Fixes #70201.
- Providers/OpenAI Codex: synthesize the `openai-codex/gpt-5.5` OAuth model row when Codex catalog discovery omits it, so cron and subagent runs do not fail with `Unknown model` while the account is authenticated.
- Models/CLI: keep `openclaw models list` read-only while still showing eligible configured-provider rows, so listing models no longer rewrites per-agent `models.json`. (#70847) Thanks @shakkernerd.

View File

@@ -85,6 +85,15 @@ function resolveConfiguredImageModelId(params: {
cfg: OpenClawConfig;
providerId: string;
}): string | undefined {
const configured = resolveConfiguredImageModel(params);
const id = configured?.id?.trim();
return id || undefined;
}
function resolveConfiguredImageModel(params: {
cfg: OpenClawConfig;
providerId: string;
}): { id?: string; input?: string[] } | undefined {
const providerCfg = findNormalizedProviderValue(
params.cfg.models?.providers,
params.providerId,
@@ -96,12 +105,10 @@ function resolveConfiguredImageModelId(params: {
}>;
}
| undefined;
const configured = providerCfg?.models?.find((entry) => {
return providerCfg?.models?.find((entry) => {
const id = entry?.id?.trim();
return Boolean(id) && entry?.input?.includes("image");
});
const id = configured?.id?.trim();
return id || undefined;
}
function resolveCatalogImageModelId(params: {
@@ -119,6 +126,23 @@ function resolveCatalogImageModelId(params: {
return normalizeOptionalString((autoEntry ?? matches[0])?.id);
}
/**
 * Classify whether an explicitly requested image model can accept image
 * input. Returns "supported" when either the provider configuration or
 * the model catalog confirms vision support, "unsupported" when the
 * catalog entry exists but lacks vision, and "unknown" when the model
 * cannot be found in the catalog at all.
 */
async function explicitImageModelVisionStatus(params: {
  cfg: OpenClawConfig;
  providerId: string;
  model: string;
}): Promise<"supported" | "unsupported" | "unknown"> {
  // A configured provider entry that names this exact model and lists
  // "image" input is authoritative — skip the catalog lookup entirely.
  const configuredEntry = resolveConfiguredImageModel(params);
  const configuredId = configuredEntry?.id?.trim();
  if (configuredId === params.model && configuredEntry?.input?.includes("image")) {
    return "supported";
  }
  // Fall back to the model catalog; a missing entry means we cannot
  // judge the model either way.
  const catalogEntries = await loadModelCatalog({ config: params.cfg });
  const catalogEntry = findModelInCatalog(catalogEntries, params.providerId, params.model);
  if (!catalogEntry) {
    return "unknown";
  }
  return modelSupportsVision(catalogEntry) ? "supported" : "unsupported";
}
async function resolveAutoImageModelId(params: {
cfg: OpenClawConfig;
providerId: string;
@@ -126,7 +150,14 @@ async function resolveAutoImageModelId(params: {
}): Promise<string | undefined> {
const explicit = normalizeOptionalString(params.explicitModel);
if (explicit) {
return explicit;
const explicitStatus = await explicitImageModelVisionStatus({
cfg: params.cfg,
providerId: params.providerId,
model: explicit,
});
if (explicitStatus !== "unsupported") {
return explicit;
}
}
const configuredModel = resolveConfiguredImageModelId(params);
if (configuredModel) {
@@ -498,6 +529,16 @@ function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstan
return entries;
}
/**
 * True when the user has explicitly routed image understanding, either
 * via `tools.media.image.models` entries in the media config or via
 * `agents.defaults.imageModel`. Used to decide whether native-vision
 * skipping may take precedence.
 */
function hasExplicitImageUnderstandingConfig(params: {
  cfg: OpenClawConfig;
  config?: MediaUnderstandingConfig;
}): boolean {
  // Media-config models win first; otherwise check agent defaults.
  const mediaModelCount = params.config?.models?.length ?? 0;
  if (mediaModelCount > 0) {
    return true;
  }
  return resolveImageModelFromAgentDefaults(params.cfg).length > 0;
}
async function resolveAutoEntries(params: {
cfg: OpenClawConfig;
agentDir?: string;
@@ -505,6 +546,12 @@ async function resolveAutoEntries(params: {
capability: MediaUnderstandingCapability;
activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig[]> {
if (params.capability === "image") {
const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg);
if (imageModelEntries.length > 0) {
return imageModelEntries;
}
}
const activeEntry = await resolveActiveModelEntry(params);
if (activeEntry) {
return [activeEntry];
@@ -519,12 +566,6 @@ async function resolveAutoEntries(params: {
return [localAudio];
}
}
if (params.capability === "image") {
const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg);
if (imageModelEntries.length > 0) {
return imageModelEntries;
}
}
const gemini = await resolveGeminiCliEntry(params.capability);
if (gemini) {
return [gemini];
@@ -553,6 +594,12 @@ export async function resolveAutoImageModel(params: {
}
return { provider, model };
};
const configuredImageModel = resolveImageModelFromAgentDefaults(params.cfg)
.map((entry) => toActive(entry))
.find((entry): entry is ActiveMediaModel => entry !== null);
if (configuredImageModel) {
return configuredImageModel;
}
const activeEntry = await resolveActiveModelEntry({
cfg: params.cfg,
agentDir: params.agentDir,
@@ -772,7 +819,11 @@ export async function runCapability(params: {
// Skip image understanding when the primary model supports vision natively.
// The image will be injected directly into the model context instead.
const activeProvider = params.activeModel?.provider?.trim();
if (capability === "image" && activeProvider) {
if (
capability === "image" &&
activeProvider &&
!hasExplicitImageUnderstandingConfig({ cfg, config })
) {
const catalog = await loadModelCatalog({ config: cfg });
const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
if (modelSupportsVision(entry)) {

View File

@@ -13,7 +13,14 @@ import { setActivePluginRegistry } from "../plugins/runtime.js";
import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js";
import { withMediaFixture } from "./runner.test-utils.js";
const baseCatalog = [
// Minimal shape of a model-catalog row as these tests use it; `input`
// lists the accepted modalities (e.g. "text", "image") and drives the
// vision-support checks under test.
type TestCatalogEntry = {
  id: string;
  name: string;
  provider: string;
  input: readonly string[];
};
const baseCatalog: TestCatalogEntry[] = [
{
id: "gpt-4.1",
name: "GPT-4.1",
@@ -21,7 +28,7 @@ const baseCatalog = [
input: ["text", "image"] as const,
},
];
let catalog = [...baseCatalog];
let catalog: TestCatalogEntry[] = [...baseCatalog];
const loadModelCatalog = vi.hoisted(() => vi.fn(async () => catalog));
@@ -141,6 +148,117 @@ describe("runCapability image skip", () => {
}
});
// Regression test: an explicit `tools.media.image.models` entry must be
// honored even when the active chat model (gpt-4.1 in the base catalog)
// supports vision natively — i.e. the native-vision skip must not win.
it("uses explicit media image models instead of native vision skip", async () => {
  await withMediaFixture(
    {
      filePrefix: "openclaw-image-explicit-vision",
      extension: "png",
      mediaType: "image/png",
      fileContents: Buffer.from("image"),
    },
    async ({ ctx, media, cache }) => {
      const cfg = {} as OpenClawConfig;
      const result = await runCapability({
        capability: "image",
        cfg,
        ctx,
        attachments: cache,
        media,
        agentDir: "/tmp",
        // Stub provider registry: openrouter echoes back the model it
        // was asked to describe with, so we can assert routing.
        providerRegistry: new Map([
          [
            "openrouter",
            {
              id: "openrouter",
              capabilities: ["image"],
              describeImage: async (req) => ({ text: "explicit ok", model: req.model }),
            },
          ],
        ]),
        // Explicit media-image model configuration under test.
        config: {
          models: [{ provider: "openrouter", model: "google/gemini-2.5-flash" }],
        },
        // Active model is vision-capable per baseCatalog; without the fix
        // this would trigger the native-vision skip.
        activeModel: { provider: "openai", model: "gpt-4.1" },
      });
      expect(result.decision.outcome).toBe("success");
      // Output must come from the explicitly configured provider/model.
      expect(result.outputs[0]).toMatchObject({
        provider: "openrouter",
        model: "google/gemini-2.5-flash",
        text: "explicit ok",
      });
    },
  );
});
// `agents.defaults.imageModel` must outrank the active model when
// resolving the auto image model, even though gpt-4.1 is vision-capable.
it("prefers agents.defaults.imageModel over the active model for auto image resolution", async () => {
  const cfg = {
    agents: {
      defaults: {
        // "provider/model" primary ref pointing at OpenRouter Gemini.
        imageModel: { primary: "openrouter/google/gemini-2.5-flash" },
      },
    },
  } as OpenClawConfig;
  await expect(
    resolveAutoImageModel({
      cfg,
      activeModel: { provider: "openai", model: "gpt-4.1" },
    }),
  ).resolves.toEqual({
    provider: "openrouter",
    model: "google/gemini-2.5-flash",
  });
});
// When the active model is text-only, auto image resolution should fall
// back to the provider plugin's declared default image model
// (MiniMax VL here) rather than failing or keeping the text model.
it("falls back from an active text model to the provider image default", async () => {
  // Override the hoisted catalog: one text-only model and one
  // vision-capable sibling on the same provider.
  catalog = [
    {
      id: "MiniMax-M2.7",
      name: "MiniMax M2.7",
      provider: "minimax-portal",
      input: ["text"] as const,
    },
    {
      id: "MiniMax-VL-01",
      name: "MiniMax VL 01",
      provider: "minimax-portal",
      input: ["text", "image"] as const,
    },
  ];
  // Provider auth so the plugin provider is considered eligible.
  vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key");
  const cfg = {} as OpenClawConfig;
  // Register a media-understanding plugin provider that declares a
  // default image model for this provider id.
  const pluginRegistry = createEmptyPluginRegistry();
  pluginRegistry.mediaUnderstandingProviders.push({
    pluginId: "minimax",
    pluginName: "MiniMax Provider",
    source: "test",
    provider: {
      id: "minimax-portal",
      capabilities: ["image"],
      defaultModels: { image: "MiniMax-VL-01" },
      describeImage: async () => ({ text: "ok" }),
    },
  });
  setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
  try {
    await expect(
      resolveAutoImageModel({
        cfg,
        // Active model is text-only; expect fallback to the VL default.
        activeModel: { provider: "minimax-portal", model: "MiniMax-M2.7" },
      }),
    ).resolves.toEqual({
      provider: "minimax-portal",
      model: "MiniMax-VL-01",
    });
  } finally {
    // Restore global state so later tests are unaffected.
    setActivePluginRegistry(createEmptyPluginRegistry());
    vi.unstubAllEnvs();
  }
});
it("uses active OpenRouter image models for auto image resolution", async () => {
vi.stubEnv("OPENROUTER_API_KEY", "test-openrouter-key");
const cfg = {} as OpenClawConfig;