mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:40:44 +00:00
fix: honor explicit media image model routing
This commit is contained in:
@@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Media understanding: honor explicit image-model configuration before native-vision skips, including `agents.defaults.imageModel`, `tools.media.image.models`, and provider image defaults such as MiniMax VL when the active chat model is text-only. Fixes #47614, #63722, #69171.
|
||||
- Codex/media understanding: support `codex/*` image models through bounded Codex app-server image turns, while keeping `openai-codex/*` on the OpenAI Codex OAuth route and validating app-server responses against generated protocol contracts. Fixes #70201.
|
||||
- Providers/OpenAI Codex: synthesize the `openai-codex/gpt-5.5` OAuth model row when Codex catalog discovery omits it, so cron and subagent runs do not fail with `Unknown model` while the account is authenticated.
|
||||
- Models/CLI: keep `openclaw models list` read-only while still showing eligible configured-provider rows, so listing models no longer rewrites per-agent `models.json`. (#70847) Thanks @shakkernerd.
|
||||
|
||||
@@ -85,6 +85,15 @@ function resolveConfiguredImageModelId(params: {
|
||||
cfg: OpenClawConfig;
|
||||
providerId: string;
|
||||
}): string | undefined {
|
||||
const configured = resolveConfiguredImageModel(params);
|
||||
const id = configured?.id?.trim();
|
||||
return id || undefined;
|
||||
}
|
||||
|
||||
function resolveConfiguredImageModel(params: {
|
||||
cfg: OpenClawConfig;
|
||||
providerId: string;
|
||||
}): { id?: string; input?: string[] } | undefined {
|
||||
const providerCfg = findNormalizedProviderValue(
|
||||
params.cfg.models?.providers,
|
||||
params.providerId,
|
||||
@@ -96,12 +105,10 @@ function resolveConfiguredImageModelId(params: {
|
||||
}>;
|
||||
}
|
||||
| undefined;
|
||||
const configured = providerCfg?.models?.find((entry) => {
|
||||
return providerCfg?.models?.find((entry) => {
|
||||
const id = entry?.id?.trim();
|
||||
return Boolean(id) && entry?.input?.includes("image");
|
||||
});
|
||||
const id = configured?.id?.trim();
|
||||
return id || undefined;
|
||||
}
|
||||
|
||||
function resolveCatalogImageModelId(params: {
|
||||
@@ -119,6 +126,23 @@ function resolveCatalogImageModelId(params: {
|
||||
return normalizeOptionalString((autoEntry ?? matches[0])?.id);
|
||||
}
|
||||
|
||||
/**
 * Classifies whether an explicitly requested image model can take image input.
 *
 * The provider's configured image model is consulted first: when its trimmed id
 * equals the requested model and it lists "image" among its inputs, the model is
 * reported as supported without loading the catalog. Otherwise the model catalog
 * decides.
 *
 * @param params.cfg - Config passed to the configured-model lookup and the catalog loader.
 * @param params.providerId - Provider that owns the requested model.
 * @param params.model - Requested model id, compared verbatim against the configured id.
 * @returns "supported" or "unsupported" when config or catalog knows the model,
 *   "unknown" when the catalog has no entry for it.
 */
async function explicitImageModelVisionStatus(params: {
  cfg: OpenClawConfig;
  providerId: string;
  model: string;
}): Promise<"supported" | "unsupported" | "unknown"> {
  const configuredEntry = resolveConfiguredImageModel(params);
  const configuredId = configuredEntry?.id?.trim();
  if (configuredId === params.model && configuredEntry?.input?.includes("image")) {
    return "supported";
  }

  // Config did not vouch for this model; fall back to the catalog.
  const catalogEntry = findModelInCatalog(
    await loadModelCatalog({ config: params.cfg }),
    params.providerId,
    params.model,
  );
  if (!catalogEntry) {
    return "unknown";
  }
  if (modelSupportsVision(catalogEntry)) {
    return "supported";
  }
  return "unsupported";
}
|
||||
|
||||
async function resolveAutoImageModelId(params: {
|
||||
cfg: OpenClawConfig;
|
||||
providerId: string;
|
||||
@@ -126,7 +150,14 @@ async function resolveAutoImageModelId(params: {
|
||||
}): Promise<string | undefined> {
|
||||
const explicit = normalizeOptionalString(params.explicitModel);
|
||||
if (explicit) {
|
||||
return explicit;
|
||||
const explicitStatus = await explicitImageModelVisionStatus({
|
||||
cfg: params.cfg,
|
||||
providerId: params.providerId,
|
||||
model: explicit,
|
||||
});
|
||||
if (explicitStatus !== "unsupported") {
|
||||
return explicit;
|
||||
}
|
||||
}
|
||||
const configuredModel = resolveConfiguredImageModelId(params);
|
||||
if (configuredModel) {
|
||||
@@ -498,6 +529,16 @@ function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstan
|
||||
return entries;
|
||||
}
|
||||
|
||||
/**
 * Reports whether image understanding has been configured explicitly.
 *
 * True when the media-understanding config lists at least one model, or when
 * `resolveImageModelFromAgentDefaults` yields at least one entry for `cfg`.
 * The agent-defaults lookup only runs if the model list is empty or absent.
 *
 * @param params.cfg - Config handed to the agent-defaults image model lookup.
 * @param params.config - Optional media-understanding config whose `models` list is checked.
 * @returns `true` if either source provides an image model, otherwise `false`.
 */
function hasExplicitImageUnderstandingConfig(params: {
  cfg: OpenClawConfig;
  config?: MediaUnderstandingConfig;
}): boolean {
  const listedModelCount = params.config?.models?.length ?? 0;
  if (listedModelCount > 0) {
    return true;
  }
  const agentDefaultEntries = resolveImageModelFromAgentDefaults(params.cfg);
  return agentDefaultEntries.length > 0;
}
|
||||
|
||||
async function resolveAutoEntries(params: {
|
||||
cfg: OpenClawConfig;
|
||||
agentDir?: string;
|
||||
@@ -505,6 +546,12 @@ async function resolveAutoEntries(params: {
|
||||
capability: MediaUnderstandingCapability;
|
||||
activeModel?: ActiveMediaModel;
|
||||
}): Promise<MediaUnderstandingModelConfig[]> {
|
||||
if (params.capability === "image") {
|
||||
const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg);
|
||||
if (imageModelEntries.length > 0) {
|
||||
return imageModelEntries;
|
||||
}
|
||||
}
|
||||
const activeEntry = await resolveActiveModelEntry(params);
|
||||
if (activeEntry) {
|
||||
return [activeEntry];
|
||||
@@ -519,12 +566,6 @@ async function resolveAutoEntries(params: {
|
||||
return [localAudio];
|
||||
}
|
||||
}
|
||||
if (params.capability === "image") {
|
||||
const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg);
|
||||
if (imageModelEntries.length > 0) {
|
||||
return imageModelEntries;
|
||||
}
|
||||
}
|
||||
const gemini = await resolveGeminiCliEntry(params.capability);
|
||||
if (gemini) {
|
||||
return [gemini];
|
||||
@@ -553,6 +594,12 @@ export async function resolveAutoImageModel(params: {
|
||||
}
|
||||
return { provider, model };
|
||||
};
|
||||
const configuredImageModel = resolveImageModelFromAgentDefaults(params.cfg)
|
||||
.map((entry) => toActive(entry))
|
||||
.find((entry): entry is ActiveMediaModel => entry !== null);
|
||||
if (configuredImageModel) {
|
||||
return configuredImageModel;
|
||||
}
|
||||
const activeEntry = await resolveActiveModelEntry({
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
@@ -772,7 +819,11 @@ export async function runCapability(params: {
|
||||
// Skip image understanding when the primary model supports vision natively.
|
||||
// The image will be injected directly into the model context instead.
|
||||
const activeProvider = params.activeModel?.provider?.trim();
|
||||
if (capability === "image" && activeProvider) {
|
||||
if (
|
||||
capability === "image" &&
|
||||
activeProvider &&
|
||||
!hasExplicitImageUnderstandingConfig({ cfg, config })
|
||||
) {
|
||||
const catalog = await loadModelCatalog({ config: cfg });
|
||||
const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
|
||||
if (modelSupportsVision(entry)) {
|
||||
|
||||
@@ -13,7 +13,14 @@ import { setActivePluginRegistry } from "../plugins/runtime.js";
|
||||
import { createMediaAttachmentCache, normalizeMediaAttachments } from "./runner.attachments.js";
|
||||
import { withMediaFixture } from "./runner.test-utils.js";
|
||||
|
||||
const baseCatalog = [
|
||||
/**
 * Shape of one model catalog row used by this test file.
 *
 * Both `baseCatalog` and the mutable `catalog` variable are typed as arrays of
 * this, so individual tests can reassign `catalog` with rows whose `input`
 * lists differ (for example text-only versus text plus image).
 */
type TestCatalogEntry = {
|
||||
id: string;
|
||||
name: string;
|
||||
provider: string;
|
||||
input: readonly string[];
|
||||
};
|
||||
|
||||
const baseCatalog: TestCatalogEntry[] = [
|
||||
{
|
||||
id: "gpt-4.1",
|
||||
name: "GPT-4.1",
|
||||
@@ -21,7 +28,7 @@ const baseCatalog = [
|
||||
input: ["text", "image"] as const,
|
||||
},
|
||||
];
|
||||
let catalog = [...baseCatalog];
|
||||
let catalog: TestCatalogEntry[] = [...baseCatalog];
|
||||
|
||||
const loadModelCatalog = vi.hoisted(() => vi.fn(async () => catalog));
|
||||
|
||||
@@ -141,6 +148,117 @@ describe("runCapability image skip", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("uses explicit media image models instead of native vision skip", async () => {
|
||||
await withMediaFixture(
|
||||
{
|
||||
filePrefix: "openclaw-image-explicit-vision",
|
||||
extension: "png",
|
||||
mediaType: "image/png",
|
||||
fileContents: Buffer.from("image"),
|
||||
},
|
||||
async ({ ctx, media, cache }) => {
|
||||
const cfg = {} as OpenClawConfig;
|
||||
|
||||
const result = await runCapability({
|
||||
capability: "image",
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media,
|
||||
agentDir: "/tmp",
|
||||
providerRegistry: new Map([
|
||||
[
|
||||
"openrouter",
|
||||
{
|
||||
id: "openrouter",
|
||||
capabilities: ["image"],
|
||||
describeImage: async (req) => ({ text: "explicit ok", model: req.model }),
|
||||
},
|
||||
],
|
||||
]),
|
||||
config: {
|
||||
models: [{ provider: "openrouter", model: "google/gemini-2.5-flash" }],
|
||||
},
|
||||
activeModel: { provider: "openai", model: "gpt-4.1" },
|
||||
});
|
||||
|
||||
expect(result.decision.outcome).toBe("success");
|
||||
expect(result.outputs[0]).toMatchObject({
|
||||
provider: "openrouter",
|
||||
model: "google/gemini-2.5-flash",
|
||||
text: "explicit ok",
|
||||
});
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("prefers agents.defaults.imageModel over the active model for auto image resolution", async () => {
|
||||
const cfg = {
|
||||
agents: {
|
||||
defaults: {
|
||||
imageModel: { primary: "openrouter/google/gemini-2.5-flash" },
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig;
|
||||
|
||||
await expect(
|
||||
resolveAutoImageModel({
|
||||
cfg,
|
||||
activeModel: { provider: "openai", model: "gpt-4.1" },
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
provider: "openrouter",
|
||||
model: "google/gemini-2.5-flash",
|
||||
});
|
||||
});
|
||||
|
||||
it("falls back from an active text model to the provider image default", async () => {
|
||||
catalog = [
|
||||
{
|
||||
id: "MiniMax-M2.7",
|
||||
name: "MiniMax M2.7",
|
||||
provider: "minimax-portal",
|
||||
input: ["text"] as const,
|
||||
},
|
||||
{
|
||||
id: "MiniMax-VL-01",
|
||||
name: "MiniMax VL 01",
|
||||
provider: "minimax-portal",
|
||||
input: ["text", "image"] as const,
|
||||
},
|
||||
];
|
||||
vi.stubEnv("MINIMAX_API_KEY", "test-minimax-key");
|
||||
const cfg = {} as OpenClawConfig;
|
||||
const pluginRegistry = createEmptyPluginRegistry();
|
||||
pluginRegistry.mediaUnderstandingProviders.push({
|
||||
pluginId: "minimax",
|
||||
pluginName: "MiniMax Provider",
|
||||
source: "test",
|
||||
provider: {
|
||||
id: "minimax-portal",
|
||||
capabilities: ["image"],
|
||||
defaultModels: { image: "MiniMax-VL-01" },
|
||||
describeImage: async () => ({ text: "ok" }),
|
||||
},
|
||||
});
|
||||
setCompatibleActiveMediaUnderstandingRegistry(pluginRegistry, cfg);
|
||||
|
||||
try {
|
||||
await expect(
|
||||
resolveAutoImageModel({
|
||||
cfg,
|
||||
activeModel: { provider: "minimax-portal", model: "MiniMax-M2.7" },
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
provider: "minimax-portal",
|
||||
model: "MiniMax-VL-01",
|
||||
});
|
||||
} finally {
|
||||
setActivePluginRegistry(createEmptyPluginRegistry());
|
||||
vi.unstubAllEnvs();
|
||||
}
|
||||
});
|
||||
|
||||
it("uses active OpenRouter image models for auto image resolution", async () => {
|
||||
vi.stubEnv("OPENROUTER_API_KEY", "test-openrouter-key");
|
||||
const cfg = {} as OpenClawConfig;
|
||||
|
||||
Reference in New Issue
Block a user