fix: route explicit image describe models

This commit is contained in:
Peter Steinberger
2026-04-21 22:19:19 +01:00
parent a7ff7dd945
commit e71e543350
4 changed files with 81 additions and 8 deletions

View File

@@ -57,6 +57,10 @@ const mocks = vi.hoisted(() => ({
provider: "openai",
model: "gpt-4.1-mini",
})),
describeImageFileWithModel: vi.fn(async () => ({
text: "friendly lobster",
model: "gpt-4.1-mini",
})),
generateImage: vi.fn(),
generateVideo: vi.fn(),
transcribeAudioFile: vi.fn(async () => ({ text: "meeting notes" })),
@@ -179,6 +183,8 @@ vi.mock("../gateway/connection-details.js", () => ({
vi.mock("../media-understanding/runtime.js", () => ({
describeImageFile:
mocks.describeImageFile as typeof import("../media-understanding/runtime.js").describeImageFile,
describeImageFileWithModel:
mocks.describeImageFileWithModel as typeof import("../media-understanding/runtime.js").describeImageFileWithModel,
describeVideoFile: vi.fn(),
transcribeAudioFile:
mocks.transcribeAudioFile as typeof import("../media-understanding/runtime.js").transcribeAudioFile,
@@ -289,6 +295,7 @@ describe("capability cli", () => {
return {};
}) as never);
mocks.describeImageFile.mockClear();
mocks.describeImageFileWithModel.mockClear();
mocks.generateImage.mockReset();
mocks.generateVideo.mockReset();
mocks.transcribeAudioFile.mockClear();
@@ -384,6 +391,37 @@ describe("capability cli", () => {
);
});
// Checks that `capability image describe --model <provider>/<model>` is routed through
// describeImageFileWithModel rather than the default describeImageFile path.
it("uses the explicit media-understanding provider for image describe model overrides", async () => {
await runRegisteredCli({
register: registerCapabilityCli as (program: Command) => void,
argv: [
"capability",
"image",
"describe",
"--file",
"photo.jpg",
"--model",
"ollama/qwen2.5vl:7b",
"--json",
],
});
// The "ollama/qwen2.5vl:7b" override is expected to arrive split into separate
// provider and model fields, with the file path ending in the requested file name.
expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith(
expect.objectContaining({
filePath: expect.stringMatching(/photo\.jpg$/),
provider: "ollama",
model: "qwen2.5vl:7b",
}),
);
// The non-override describe function must not be used when a model is given explicitly.
expect(mocks.describeImageFile).not.toHaveBeenCalled();
// The reported provider is the one requested on the command line. The reported model is
// "gpt-4.1-mini" only because that is the value the describeImageFileWithModel mock
// returns; the CLI output takes `model` from the runtime result.
expect(mocks.runtime.writeJson).toHaveBeenCalledWith(
expect.objectContaining({
provider: "ollama",
model: "gpt-4.1-mini",
}),
);
});
it("fails image describe when no description text is returned", async () => {
mocks.describeImageFile.mockResolvedValueOnce({
text: undefined,

View File

@@ -25,6 +25,7 @@ import { generateImage, listRuntimeImageGenerationProviders } from "../image-gen
import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js";
import {
describeImageFile,
describeImageFileWithModel,
describeVideoFile,
transcribeAudioFile,
} from "../media-understanding/runtime.js";
@@ -749,21 +750,32 @@ async function runImageDescribe(params: {
model?: string;
}) {
const cfg = loadConfig();
const agentDir = resolveAgentDir(cfg, resolveDefaultAgentId(cfg));
const activeModel = requireProviderModelOverride(params.model);
const outputs = await Promise.all(
params.files.map(async (filePath) => {
const result = await describeImageFile({
filePath: path.resolve(filePath),
cfg,
activeModel,
});
const resolvedPath = path.resolve(filePath);
const result = activeModel
? await describeImageFileWithModel({
filePath: resolvedPath,
cfg,
agentDir,
provider: activeModel.provider,
model: activeModel.model,
prompt: "Describe the image.",
})
: await describeImageFile({
filePath: resolvedPath,
cfg,
agentDir,
});
if (!result.text) {
throw new Error(`No description returned for image: ${path.resolve(filePath)}`);
throw new Error(`No description returned for image: ${resolvedPath}`);
}
return {
path: path.resolve(filePath),
path: resolvedPath,
text: result.text,
provider: result.provider,
provider: activeModel?.provider ?? ("provider" in result ? result.provider : undefined),
model: result.model,
kind: "image.description",
};

View File

@@ -17,6 +17,7 @@ const hoisted = vi.hoisted(() => ({
setRuntimeApiKeyMock: vi.fn(),
discoverModelsMock: vi.fn(),
fetchMock: vi.fn(),
registerProviderStreamForModelMock: vi.fn(),
}));
const {
completeMock,
@@ -27,6 +28,7 @@ const {
setRuntimeApiKeyMock,
discoverModelsMock,
fetchMock,
registerProviderStreamForModelMock,
} = hoisted;
vi.mock("@mariozechner/pi-ai", async () => {
@@ -50,6 +52,10 @@ vi.mock("../agents/model-auth.js", () => ({
requireApiKey: requireApiKeyMock,
}));
// Replace provider stream registration with a hoisted mock so tests can assert
// the arguments it is called with.
vi.mock("../agents/provider-stream.js", () => ({
registerProviderStreamForModel: registerProviderStreamForModelMock,
}));
vi.mock("../agents/pi-model-discovery-runtime.js", () => ({
discoverAuthStorage: () => ({
setRuntimeApiKey: setRuntimeApiKeyMock,
@@ -168,6 +174,16 @@ describe("describeImageWithModel", () => {
text: "generic ok",
model: "custom-vision",
});
expect(registerProviderStreamForModelMock).toHaveBeenCalledWith(
expect.objectContaining({
model: expect.objectContaining({
provider: "minimax-portal",
id: "custom-vision",
}),
cfg: {},
agentDir: "/tmp/openclaw-agent",
}),
);
expect(completeMock).toHaveBeenCalledOnce();
expect(fetchMock).not.toHaveBeenCalled();
});

View File

@@ -9,6 +9,7 @@ import {
import { normalizeModelRef } from "../agents/model-selection.js";
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js";
import { registerProviderStreamForModel } from "../agents/provider-stream.js";
import {
coerceImageAssistantText,
hasImageReasoningOnlyResponse,
@@ -245,6 +246,12 @@ export async function describeImagesWithModel(
});
}
registerProviderStreamForModel({
model,
cfg: params.cfg,
agentDir: params.agentDir,
});
const context = buildImageContext(prompt, params.images);
const controller = new AbortController();
const timeout =