From fa7de462610f6699640b9abee650ce14903fe123 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 07:47:15 +0100 Subject: [PATCH] fix(cli): report missing infer media providers --- CHANGELOG.md | 1 + src/cli/capability-cli.test.ts | 69 ++++++++++++++++++++++++ src/cli/capability-cli.ts | 20 +++++++ src/media-understanding/runtime-types.ts | 11 +++- src/media-understanding/runtime.test.ts | 34 ++++++++++++ src/media-understanding/runtime.ts | 16 ++++-- 6 files changed, 145 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6de79545943..bf90a540430 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Infer/media: report missing image-understanding and audio-transcription provider configuration for `image describe`, `image describe-many`, and `audio transcribe` instead of blaming the input path when no provider is available. Fixes #73569 and supersedes #73593, #74288, and #74495. Thanks @bittoby, @tmimmanuel, @Linux2010, and @vyctorbrzezowski. - Active Memory: use the configured recall timeout as the blocking prompt-build hook budget by default and move cold-start setup grace behind explicit `setupGraceTimeoutMs` config, so the plugin no longer silently extends 15000 ms configs to 45000 ms on the main lane. Fixes #75843. Thanks @vishutdhar. - Plugins/web-provider: reuse the active gateway plugin registry for runtime web provider resolution after deriving the same candidate plugin ids as the loader path, avoiding a redundant `loadOpenClawPlugins` call on every request while preserving origin and scope filters. Fixes #75513. Thanks @jochen. - Crestodian/CLI: exit non-zero when interactive Crestodian is invoked without a TTY, so scripts and CI no longer treat the setup error as success. Fixes #73646 and supersedes #73928 and #74059. Thanks @bittoby, @luyao618, and @Linux2010. 
diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts index 4d2d89d7649..c6f1fee96a8 100644 --- a/src/cli/capability-cli.test.ts +++ b/src/cli/capability-cli.test.ts @@ -782,6 +782,51 @@ describe("capability cli", () => { ); }); + it("reports missing image understanding configuration for image describe", async () => { + mocks.describeImageFile.mockResolvedValueOnce({ + text: undefined, + decision: { + capability: "image", + outcome: "skipped", + attachments: [{ attachmentIndex: 0, attempts: [] }], + }, + } as never); + + await expect( + runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: ["capability", "image", "describe", "--file", "photo.jpg", "--json"], + }), + ).rejects.toThrow("exit 1"); + expect(mocks.runtime.error).toHaveBeenCalledWith( + expect.stringContaining("No image understanding provider is configured or ready"), + ); + expect(mocks.runtime.error).toHaveBeenCalledWith( + expect.stringContaining("agents.defaults.imageModel.primary"), + ); + }); + + it("reports missing image understanding configuration for image describe-many", async () => { + mocks.describeImageFile.mockResolvedValueOnce({ + text: undefined, + decision: { + capability: "image", + outcome: "skipped", + attachments: [{ attachmentIndex: 0, attempts: [] }], + }, + } as never); + + await expect( + runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: ["capability", "image", "describe-many", "--file", "photo.jpg", "--json"], + }), + ).rejects.toThrow("exit 1"); + expect(mocks.runtime.error).toHaveBeenCalledWith( + expect.stringContaining("No image understanding provider is configured or ready"), + ); + }); + it("rewrites mismatched explicit image output extensions to the detected file type", async () => { const jpegBase64 = 
"/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxAQEBUQEBAVFRUVFRUVFRUVFRUVFRUVFRUXFhUVFRUYHSggGBolHRUVITEhJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGi0fHyUtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLf/AABEIAAEAAQMBIgACEQEDEQH/xAAXAAEBAQEAAAAAAAAAAAAAAAAAAQID/8QAFhEBAQEAAAAAAAAAAAAAAAAAAAER/9oADAMBAAIQAxAAAAH2AP/EABgQAQEAAwAAAAAAAAAAAAAAAAEAEQIS/9oACAEBAAEFAk1o7//EABYRAQEBAAAAAAAAAAAAAAAAAAABEf/aAAgBAwEBPwGn/8QAFhEBAQEAAAAAAAAAAAAAAAAAABEB/9oACAECAQE/AYf/xAAaEAACAgMAAAAAAAAAAAAAAAABEQAhMUFh/9oACAEBAAY/AjK9cY2f/8QAGhABAQACAwAAAAAAAAAAAAAAAAERITFBUf/aAAgBAQABPyGQk7W5jVYkA//Z"; @@ -1278,6 +1323,30 @@ describe("capability cli", () => { ); }); + it("reports missing audio transcription configuration for audio transcribe", async () => { + mocks.transcribeAudioFile.mockResolvedValueOnce({ + text: undefined, + decision: { + capability: "audio", + outcome: "skipped", + attachments: [{ attachmentIndex: 0, attempts: [] }], + }, + } as never); + + await expect( + runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"], + }), + ).rejects.toThrow("exit 1"); + expect(mocks.runtime.error).toHaveBeenCalledWith( + expect.stringContaining("No audio transcription provider is configured or ready"), + ); + expect(mocks.runtime.error).toHaveBeenCalledWith( + expect.stringContaining("tools.media.audio.models"), + ); + }); + it("surfaces the underlying transcription failure for audio transcribe", async () => { mocks.transcribeAudioFile.mockRejectedValueOnce( new Error("Audio transcription response missing text"), diff --git a/src/cli/capability-cli.ts b/src/cli/capability-cli.ts index 185e0e275ed..f74e693bace 100644 --- a/src/cli/capability-cli.ts +++ b/src/cli/capability-cli.ts @@ -30,6 +30,7 @@ import type { ImageGenerationOutputFormat, } from "../image-generation/types.js"; import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js"; +import type { 
RunMediaUnderstandingFileResult } from "../media-understanding/runtime-types.js"; import { describeImageFile, describeImageFileWithModel, @@ -964,6 +965,11 @@ async function runImageDescribe(params: { timeoutMs: params.timeoutMs, }); if (!result.text) { + if (isMissingMediaUnderstandingProvider(result)) { + throw new Error( + "No image understanding provider is configured or ready. Configure tools.media.image.models or agents.defaults.imageModel.primary, or pass --model after configuring that provider's auth/API key.", + ); + } throw new Error(`No description returned for image: ${resolvedPath}`); } return { @@ -986,6 +992,15 @@ async function runImageDescribe(params: { } satisfies CapabilityEnvelope; } +function isMissingMediaUnderstandingProvider(result: RunMediaUnderstandingFileResult): boolean { + const decision = result.decision; + return ( + decision?.outcome === "skipped" && + decision.attachments.length > 0 && + decision.attachments.every((attachment) => attachment.attempts.length === 0) + ); +} + async function runAudioTranscribe(params: { file: string; language?: string; @@ -1002,6 +1017,11 @@ async function runAudioTranscribe(params: { prompt: params.prompt, }); if (!result.text) { + if (isMissingMediaUnderstandingProvider(result)) { + throw new Error( + "No audio transcription provider is configured or ready. 
Configure tools.media.audio.models, or pass --model after configuring that provider's auth/API key.", + ); + } throw new Error(`No transcript returned for audio: ${path.resolve(params.file)}`); } return { diff --git a/src/media-understanding/runtime-types.ts b/src/media-understanding/runtime-types.ts index b31d062eb58..8c4e9831523 100644 --- a/src/media-understanding/runtime-types.ts +++ b/src/media-understanding/runtime-types.ts @@ -1,6 +1,10 @@ import type { OpenClawConfig } from "../config/types.js"; import type { ActiveMediaModel } from "./active-model.types.js"; -import type { MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.js"; +import type { + MediaUnderstandingDecision, + MediaUnderstandingOutput, + MediaUnderstandingProvider, +} from "./types.js"; export type RunMediaUnderstandingFileParams = { capability: "image" | "audio" | "video"; @@ -18,6 +22,7 @@ export type RunMediaUnderstandingFileResult = { provider?: string; model?: string; output?: MediaUnderstandingOutput; + decision?: MediaUnderstandingDecision; }; export type DescribeImageFileParams = { @@ -73,5 +78,7 @@ export type MediaUnderstandingRuntime = { params: DescribeImageFileWithModelParams, ) => Promise; describeVideoFile: (params: DescribeVideoFileParams) => Promise; - transcribeAudioFile: (params: TranscribeAudioFileParams) => Promise<{ text: string | undefined }>; + transcribeAudioFile: ( + params: TranscribeAudioFileParams, + ) => Promise; }; diff --git a/src/media-understanding/runtime.test.ts b/src/media-understanding/runtime.test.ts index f773d6e0011..78b1914d0a6 100644 --- a/src/media-understanding/runtime.test.ts +++ b/src/media-understanding/runtime.test.ts @@ -63,12 +63,46 @@ describe("media-understanding runtime", () => { provider: undefined, model: undefined, output: undefined, + decision: { capability: "image", outcome: "disabled", attachments: [] }, }); expect(mocks.buildProviderRegistry).not.toHaveBeenCalled(); 
expect(mocks.runCapability).not.toHaveBeenCalled(); }); + it("preserves skipped decisions when no media provider is available", async () => { + const decision = { + capability: "audio" as const, + outcome: "skipped" as const, + attachments: [{ attachmentIndex: 0, attempts: [] }], + }; + mocks.normalizeMediaAttachments.mockReturnValue([ + { index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" }, + ]); + mocks.runCapability.mockResolvedValue({ + outputs: [], + decision, + }); + + await expect( + runMediaUnderstandingFile({ + capability: "audio", + filePath: "/tmp/sample.ogg", + mime: "audio/ogg", + cfg: {} as OpenClawConfig, + agentDir: "/tmp/agent", + }), + ).resolves.toEqual({ + text: undefined, + provider: undefined, + model: undefined, + output: undefined, + decision, + }); + + expect(mocks.cleanup).toHaveBeenCalledTimes(1); + }); + it("returns the matching capability output", async () => { const output: MediaUnderstandingOutput = { kind: "image.description", diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts index 65896ace322..28cacc5ee24 100644 --- a/src/media-understanding/runtime.ts +++ b/src/media-understanding/runtime.ts @@ -84,7 +84,10 @@ export async function runMediaUnderstandingFile( const ctx = buildFileContext(params); const attachments = normalizeMediaAttachments(ctx); if (attachments.length === 0) { - return { text: undefined }; + return { + text: undefined, + decision: { capability: params.capability, outcome: "no-attachment", attachments: [] }, + }; } const config = cfg.tools?.media?.[params.capability]; if (config?.enabled === false) { @@ -93,6 +96,7 @@ export async function runMediaUnderstandingFile( provider: undefined, model: undefined, output: undefined, + decision: { capability: params.capability, outcome: "disabled", attachments: [] }, }; } @@ -124,12 +128,16 @@ export async function runMediaUnderstandingFile( (entry) => entry.kind === KIND_BY_CAPABILITY[params.capability], ); const text = 
output?.text?.trim(); - return { + const fileResult: RunMediaUnderstandingFileResult = { text: text || undefined, provider: output?.provider, model: output?.model, output, }; + if (result.decision) { + fileResult.decision = result.decision; + } + return fileResult; } finally { await cache.cleanup(); } @@ -171,7 +179,7 @@ export async function describeVideoFile( export async function transcribeAudioFile( params: TranscribeAudioFileParams, -): Promise<{ text: string | undefined }> { +): Promise<RunMediaUnderstandingFileResult> { const cfg = params.language || params.prompt ? { @@ -192,5 +200,5 @@ export async function transcribeAudioFile( } : params.cfg; const result = await runMediaUnderstandingFile({ ...params, cfg, capability: "audio" }); - return { text: result.text }; + return result; }