From e71e5433505f0820b238b8328d825b7607f7d188 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 21 Apr 2026 22:19:19 +0100 Subject: [PATCH] fix: route explicit image describe models --- src/cli/capability-cli.test.ts | 38 +++++++++++++++++++++++++++ src/cli/capability-cli.ts | 28 ++++++++++++++------ src/media-understanding/image.test.ts | 16 +++++++++++ src/media-understanding/image.ts | 7 +++++ 4 files changed, 81 insertions(+), 8 deletions(-) diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts index 190d28e2e2d..5018bba941f 100644 --- a/src/cli/capability-cli.test.ts +++ b/src/cli/capability-cli.test.ts @@ -57,6 +57,10 @@ const mocks = vi.hoisted(() => ({ provider: "openai", model: "gpt-4.1-mini", })), + describeImageFileWithModel: vi.fn(async () => ({ + text: "friendly lobster", + model: "gpt-4.1-mini", + })), generateImage: vi.fn(), generateVideo: vi.fn(), transcribeAudioFile: vi.fn(async () => ({ text: "meeting notes" })), @@ -179,6 +183,8 @@ vi.mock("../gateway/connection-details.js", () => ({ vi.mock("../media-understanding/runtime.js", () => ({ describeImageFile: mocks.describeImageFile as typeof import("../media-understanding/runtime.js").describeImageFile, + describeImageFileWithModel: + mocks.describeImageFileWithModel as typeof import("../media-understanding/runtime.js").describeImageFileWithModel, describeVideoFile: vi.fn(), transcribeAudioFile: mocks.transcribeAudioFile as typeof import("../media-understanding/runtime.js").transcribeAudioFile, @@ -289,6 +295,7 @@ describe("capability cli", () => { return {}; }) as never); mocks.describeImageFile.mockClear(); + mocks.describeImageFileWithModel.mockClear(); mocks.generateImage.mockReset(); mocks.generateVideo.mockReset(); mocks.transcribeAudioFile.mockClear(); @@ -384,6 +391,37 @@ describe("capability cli", () => { ); }); + it("uses the explicit media-understanding provider for image describe model overrides", async () => { + await runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: [ + "capability", + "image", + "describe", + "--file", + "photo.jpg", + "--model", + "ollama/qwen2.5vl:7b", + "--json", + ], + }); + + expect(mocks.describeImageFileWithModel).toHaveBeenCalledWith( + expect.objectContaining({ + filePath: expect.stringMatching(/photo\.jpg$/), + provider: "ollama", + model: "qwen2.5vl:7b", + }), + ); + expect(mocks.describeImageFile).not.toHaveBeenCalled(); + expect(mocks.runtime.writeJson).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "ollama", + model: "gpt-4.1-mini", + }), + ); + }); + it("fails image describe when no description text is returned", async () => { mocks.describeImageFile.mockResolvedValueOnce({ text: undefined, diff --git a/src/cli/capability-cli.ts b/src/cli/capability-cli.ts index a98c674e7b3..524e3e91306 100644 --- a/src/cli/capability-cli.ts +++ b/src/cli/capability-cli.ts @@ -25,6 +25,7 @@ import { generateImage, listRuntimeImageGenerationProviders } from "../image-gen import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js"; import { describeImageFile, + describeImageFileWithModel, describeVideoFile, transcribeAudioFile, } from "../media-understanding/runtime.js"; @@ -749,21 +750,32 @@ async function runImageDescribe(params: { model?: string; }) { const cfg = loadConfig(); + const agentDir = resolveAgentDir(cfg, resolveDefaultAgentId(cfg)); const activeModel = requireProviderModelOverride(params.model); const outputs = await Promise.all( params.files.map(async (filePath) => { - const result = await describeImageFile({ - filePath: path.resolve(filePath), - cfg, - activeModel, - }); + const resolvedPath = path.resolve(filePath); + const result = activeModel + ? await describeImageFileWithModel({ + filePath: resolvedPath, + cfg, + agentDir, + provider: activeModel.provider, + model: activeModel.model, + prompt: "Describe the image.", + }) + : await describeImageFile({ + filePath: resolvedPath, + cfg, + agentDir, + }); if (!result.text) { - throw new Error(`No description returned for image: ${path.resolve(filePath)}`); + throw new Error(`No description returned for image: ${resolvedPath}`); } return { - path: path.resolve(filePath), + path: resolvedPath, text: result.text, - provider: result.provider, + provider: activeModel?.provider ?? ("provider" in result ? result.provider : undefined), model: result.model, kind: "image.description", }; diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts index d603eb3969a..7f193730e9f 100644 --- a/src/media-understanding/image.test.ts +++ b/src/media-understanding/image.test.ts @@ -17,6 +17,7 @@ const hoisted = vi.hoisted(() => ({ setRuntimeApiKeyMock: vi.fn(), discoverModelsMock: vi.fn(), fetchMock: vi.fn(), + registerProviderStreamForModelMock: vi.fn(), })); const { completeMock, @@ -27,6 +28,7 @@ const { setRuntimeApiKeyMock, discoverModelsMock, fetchMock, + registerProviderStreamForModelMock, } = hoisted; vi.mock("@mariozechner/pi-ai", async () => { @@ -50,6 +52,10 @@ vi.mock("../agents/model-auth.js", () => ({ requireApiKey: requireApiKeyMock, })); +vi.mock("../agents/provider-stream.js", () => ({ + registerProviderStreamForModel: registerProviderStreamForModelMock, +})); + vi.mock("../agents/pi-model-discovery-runtime.js", () => ({ discoverAuthStorage: () => ({ setRuntimeApiKey: setRuntimeApiKeyMock, @@ -168,6 +174,16 @@ describe("describeImageWithModel", () => { text: "generic ok", model: "custom-vision", }); + expect(registerProviderStreamForModelMock).toHaveBeenCalledWith( + expect.objectContaining({ + model: expect.objectContaining({ + provider: "minimax-portal", + id: "custom-vision", + }), + cfg: {}, + agentDir: "/tmp/openclaw-agent", + }), + ); expect(completeMock).toHaveBeenCalledOnce(); expect(fetchMock).not.toHaveBeenCalled(); }); diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts index 5fd24d8b26e..1ee3ec5bdfc 100644 --- a/src/media-understanding/image.ts +++ b/src/media-understanding/image.ts @@ -9,6 +9,7 @@ import { import { normalizeModelRef } from "../agents/model-selection.js"; import { ensureOpenClawModelsJson } from "../agents/models-config.js"; import { resolveProviderRequestCapabilities } from "../agents/provider-attribution.js"; +import { registerProviderStreamForModel } from "../agents/provider-stream.js"; import { coerceImageAssistantText, hasImageReasoningOnlyResponse, @@ -245,6 +246,12 @@ export async function describeImagesWithModel( }); } + registerProviderStreamForModel({ + model, + cfg: params.cfg, + agentDir: params.agentDir, + }); + const context = buildImageContext(prompt, params.images); const controller = new AbortController(); const timeout =