fix(cli): report missing infer media providers

This commit is contained in:
Peter Steinberger
2026-05-02 07:47:15 +01:00
parent 798515809c
commit fa7de46261
6 changed files with 145 additions and 6 deletions

View File

@@ -25,6 +25,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Infer/media: report missing image-understanding and audio-transcription provider configuration for `image describe`, `image describe-many`, and `audio transcribe` instead of blaming the input path when no provider is available. Fixes #73569 and supersedes #73593, #74288, and #74495. Thanks @bittoby, @tmimmanuel, @Linux2010, and @vyctorbrzezowski.
- Active Memory: use the configured recall timeout as the blocking prompt-build hook budget by default and move cold-start setup grace behind explicit `setupGraceTimeoutMs` config, so the plugin no longer silently extends 15000 ms configs to 45000 ms on the main lane. Fixes #75843. Thanks @vishutdhar.
- Plugins/web-provider: reuse the active gateway plugin registry for runtime web provider resolution after deriving the same candidate plugin ids as the loader path, avoiding a redundant `loadOpenClawPlugins` call on every request while preserving origin and scope filters. Fixes #75513. Thanks @jochen.
- Crestodian/CLI: exit non-zero when interactive Crestodian is invoked without a TTY, so scripts and CI no longer treat the setup error as success. Fixes #73646 and supersedes #73928 and #74059. Thanks @bittoby, @luyao618, and @Linux2010.

View File

@@ -782,6 +782,51 @@ describe("capability cli", () => {
);
});
// Feeds the CLI a describeImageFile result whose decision is "skipped" with no
// provider attempts, then expects the run to fail and the logged error to carry
// the provider-configuration hint.
it("reports missing image understanding configuration for image describe", async () => {
  const skippedWithoutAttempts = {
    text: undefined,
    decision: {
      capability: "image",
      outcome: "skipped",
      attachments: [{ attachmentIndex: 0, attempts: [] }],
    },
  };
  mocks.describeImageFile.mockResolvedValueOnce(skippedWithoutAttempts as never);

  const cliRun = runRegisteredCli({
    register: registerCapabilityCli as (program: Command) => void,
    argv: ["capability", "image", "describe", "--file", "photo.jpg", "--json"],
  });
  await expect(cliRun).rejects.toThrow("exit 1");

  // Both the headline and the config key hint must appear in a logged error.
  const expectedFragments = [
    "No image understanding provider is configured or ready",
    "agents.defaults.imageModel.primary",
  ];
  for (const fragment of expectedFragments) {
    expect(mocks.runtime.error).toHaveBeenCalledWith(expect.stringContaining(fragment));
  }
});
// Same scenario as the single-file describe case, driven through the
// `describe-many` subcommand: a skipped decision with no attempts must produce
// the missing-provider error.
it("reports missing image understanding configuration for image describe-many", async () => {
  const skippedWithoutAttempts = {
    text: undefined,
    decision: {
      capability: "image",
      outcome: "skipped",
      attachments: [{ attachmentIndex: 0, attempts: [] }],
    },
  };
  mocks.describeImageFile.mockResolvedValueOnce(skippedWithoutAttempts as never);

  const cliRun = runRegisteredCli({
    register: registerCapabilityCli as (program: Command) => void,
    argv: ["capability", "image", "describe-many", "--file", "photo.jpg", "--json"],
  });
  await expect(cliRun).rejects.toThrow("exit 1");

  const headline = expect.stringContaining(
    "No image understanding provider is configured or ready",
  );
  expect(mocks.runtime.error).toHaveBeenCalledWith(headline);
});
it("rewrites mismatched explicit image output extensions to the detected file type", async () => {
const jpegBase64 =
"/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxAQEBUQEBAVFRUVFRUVFRUVFRUVFRUVFRUXFhUVFRUYHSggGBolHRUVITEhJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGi0fHyUtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLf/AABEIAAEAAQMBIgACEQEDEQH/xAAXAAEBAQEAAAAAAAAAAAAAAAAAAQID/8QAFhEBAQEAAAAAAAAAAAAAAAAAAAER/9oADAMBAAIQAxAAAAH2AP/EABgQAQEAAwAAAAAAAAAAAAAAAAEAEQIS/9oACAEBAAEFAk1o7//EABYRAQEBAAAAAAAAAAAAAAAAAAABEf/aAAgBAwEBPwGn/8QAFhEBAQEAAAAAAAAAAAAAAAAAABEB/9oACAECAQE/AYf/xAAaEAACAgMAAAAAAAAAAAAAAAABEQAhMUFh/9oACAEBAAY/AjK9cY2f/8QAGhABAQACAwAAAAAAAAAAAAAAAAERITFBUf/aAAgBAQABPyGQk7W5jVYkA//Z";
@@ -1278,6 +1323,30 @@ describe("capability cli", () => {
);
});
// Feeds the CLI a transcribeAudioFile result whose decision is "skipped" with
// no provider attempts, then expects the run to fail and the logged error to
// name the audio provider configuration key.
it("reports missing audio transcription configuration for audio transcribe", async () => {
  const skippedWithoutAttempts = {
    text: undefined,
    decision: {
      capability: "audio",
      outcome: "skipped",
      attachments: [{ attachmentIndex: 0, attempts: [] }],
    },
  };
  mocks.transcribeAudioFile.mockResolvedValueOnce(skippedWithoutAttempts as never);

  const cliRun = runRegisteredCli({
    register: registerCapabilityCli as (program: Command) => void,
    argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"],
  });
  await expect(cliRun).rejects.toThrow("exit 1");

  // Both the headline and the config key hint must appear in a logged error.
  const expectedFragments = [
    "No audio transcription provider is configured or ready",
    "tools.media.audio.models",
  ];
  for (const fragment of expectedFragments) {
    expect(mocks.runtime.error).toHaveBeenCalledWith(expect.stringContaining(fragment));
  }
});
it("surfaces the underlying transcription failure for audio transcribe", async () => {
mocks.transcribeAudioFile.mockRejectedValueOnce(
new Error("Audio transcription response missing text"),

View File

@@ -30,6 +30,7 @@ import type {
ImageGenerationOutputFormat,
} from "../image-generation/types.js";
import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js";
import type { RunMediaUnderstandingFileResult } from "../media-understanding/runtime-types.js";
import {
describeImageFile,
describeImageFileWithModel,
@@ -964,6 +965,11 @@ async function runImageDescribe(params: {
timeoutMs: params.timeoutMs,
});
if (!result.text) {
if (isMissingMediaUnderstandingProvider(result)) {
throw new Error(
"No image understanding provider is configured or ready. Configure tools.media.image.models or agents.defaults.imageModel.primary, or pass --model <provider/model> after configuring that provider's auth/API key.",
);
}
throw new Error(`No description returned for image: ${resolvedPath}`);
}
return {
@@ -986,6 +992,15 @@ async function runImageDescribe(params: {
} satisfies CapabilityEnvelope;
}
/**
 * Reports whether a media-understanding result was skipped without any
 * provider ever being attempted.
 *
 * @param result - Result object whose optional `decision` is inspected.
 * @returns `true` only when the decision outcome is `"skipped"`, at least one
 *   attachment is listed, and every listed attachment has an empty `attempts`
 *   array; `false` otherwise (including when no decision is present).
 */
function isMissingMediaUnderstandingProvider(result: RunMediaUnderstandingFileResult): boolean {
  const { decision } = result;
  if (!decision || decision.outcome !== "skipped") {
    return false;
  }

  const { attachments } = decision;
  if (attachments.length === 0) {
    // An empty attachment list says nothing about provider availability.
    return false;
  }

  const anyProviderTried = attachments.some((entry) => entry.attempts.length > 0);
  return !anyProviderTried;
}
async function runAudioTranscribe(params: {
file: string;
language?: string;
@@ -1002,6 +1017,11 @@ async function runAudioTranscribe(params: {
prompt: params.prompt,
});
if (!result.text) {
if (isMissingMediaUnderstandingProvider(result)) {
throw new Error(
"No audio transcription provider is configured or ready. Configure tools.media.audio.models, or pass --model <provider/model> after configuring that provider's auth/API key.",
);
}
throw new Error(`No transcript returned for audio: ${path.resolve(params.file)}`);
}
return {

View File

@@ -1,6 +1,10 @@
import type { OpenClawConfig } from "../config/types.js";
import type { ActiveMediaModel } from "./active-model.types.js";
import type { MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.js";
import type {
MediaUnderstandingDecision,
MediaUnderstandingOutput,
MediaUnderstandingProvider,
} from "./types.js";
export type RunMediaUnderstandingFileParams = {
capability: "image" | "audio" | "video";
@@ -18,6 +22,7 @@ export type RunMediaUnderstandingFileResult = {
provider?: string;
model?: string;
output?: MediaUnderstandingOutput;
decision?: MediaUnderstandingDecision;
};
export type DescribeImageFileParams = {
@@ -73,5 +78,7 @@ export type MediaUnderstandingRuntime = {
params: DescribeImageFileWithModelParams,
) => Promise<DescribeImageFileWithModelResult>;
describeVideoFile: (params: DescribeVideoFileParams) => Promise<RunMediaUnderstandingFileResult>;
transcribeAudioFile: (params: TranscribeAudioFileParams) => Promise<{ text: string | undefined }>;
transcribeAudioFile: (
params: TranscribeAudioFileParams,
) => Promise<RunMediaUnderstandingFileResult>;
};

View File

@@ -63,12 +63,46 @@ describe("media-understanding runtime", () => {
provider: undefined,
model: undefined,
output: undefined,
decision: { capability: "image", outcome: "disabled", attachments: [] },
});
expect(mocks.buildProviderRegistry).not.toHaveBeenCalled();
expect(mocks.runCapability).not.toHaveBeenCalled();
});
// When the capability runner yields no outputs but does return a decision,
// runMediaUnderstandingFile must hand that decision back unchanged alongside
// undefined text/provider/model/output, and the cleanup mock must run once.
it("preserves skipped decisions when no media provider is available", async () => {
  const skippedDecision = {
    capability: "audio" as const,
    outcome: "skipped" as const,
    attachments: [{ attachmentIndex: 0, attempts: [] }],
  };
  mocks.normalizeMediaAttachments.mockReturnValue([
    { index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },
  ]);
  mocks.runCapability.mockResolvedValue({ outputs: [], decision: skippedDecision });

  const pendingResult = runMediaUnderstandingFile({
    capability: "audio",
    filePath: "/tmp/sample.ogg",
    mime: "audio/ogg",
    cfg: {} as OpenClawConfig,
    agentDir: "/tmp/agent",
  });

  const expectedResult = {
    text: undefined,
    provider: undefined,
    model: undefined,
    output: undefined,
    decision: skippedDecision,
  };
  await expect(pendingResult).resolves.toEqual(expectedResult);

  expect(mocks.cleanup).toHaveBeenCalledTimes(1);
});
it("returns the matching capability output", async () => {
const output: MediaUnderstandingOutput = {
kind: "image.description",

View File

@@ -84,7 +84,10 @@ export async function runMediaUnderstandingFile(
const ctx = buildFileContext(params);
const attachments = normalizeMediaAttachments(ctx);
if (attachments.length === 0) {
return { text: undefined };
return {
text: undefined,
decision: { capability: params.capability, outcome: "no-attachment", attachments: [] },
};
}
const config = cfg.tools?.media?.[params.capability];
if (config?.enabled === false) {
@@ -93,6 +96,7 @@ export async function runMediaUnderstandingFile(
provider: undefined,
model: undefined,
output: undefined,
decision: { capability: params.capability, outcome: "disabled", attachments: [] },
};
}
@@ -124,12 +128,16 @@ export async function runMediaUnderstandingFile(
(entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
);
const text = output?.text?.trim();
return {
const fileResult: RunMediaUnderstandingFileResult = {
text: text || undefined,
provider: output?.provider,
model: output?.model,
output,
};
if (result.decision) {
fileResult.decision = result.decision;
}
return fileResult;
} finally {
await cache.cleanup();
}
@@ -171,7 +179,7 @@ export async function describeVideoFile(
export async function transcribeAudioFile(
params: TranscribeAudioFileParams,
): Promise<{ text: string | undefined }> {
): Promise<RunMediaUnderstandingFileResult> {
const cfg =
params.language || params.prompt
? {
@@ -192,5 +200,5 @@ export async function transcribeAudioFile(
}
: params.cfg;
const result = await runMediaUnderstandingFile({ ...params, cfg, capability: "audio" });
return { text: result.text };
return result;
}