mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:10:44 +00:00
fix(cli): report missing infer media providers
This commit is contained in:
@@ -25,6 +25,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Infer/media: report missing image-understanding and audio-transcription provider configuration for `image describe`, `image describe-many`, and `audio transcribe` instead of blaming the input path when no provider is available. Fixes #73569 and supersedes #73593, #74288, and #74495. Thanks @bittoby, @tmimmanuel, @Linux2010, and @vyctorbrzezowski.
|
||||
- Active Memory: use the configured recall timeout as the blocking prompt-build hook budget by default and move cold-start setup grace behind explicit `setupGraceTimeoutMs` config, so the plugin no longer silently extends 15000 ms configs to 45000 ms on the main lane. Fixes #75843. Thanks @vishutdhar.
|
||||
- Plugins/web-provider: reuse the active gateway plugin registry for runtime web provider resolution after deriving the same candidate plugin ids as the loader path, avoiding a redundant `loadOpenClawPlugins` call on every request while preserving origin and scope filters. Fixes #75513. Thanks @jochen.
|
||||
- Crestodian/CLI: exit non-zero when interactive Crestodian is invoked without a TTY, so scripts and CI no longer treat the setup error as success. Fixes #73646 and supersedes #73928 and #74059. Thanks @bittoby, @luyao618, and @Linux2010.
|
||||
|
||||
@@ -782,6 +782,51 @@ describe("capability cli", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("reports missing image understanding configuration for image describe", async () => {
  // Skipped decision with one attachment and zero attempts: the shape the CLI
  // is expected to report as a provider-configuration problem.
  const skippedWithoutAttempts = {
    text: undefined,
    decision: {
      capability: "image",
      outcome: "skipped",
      attachments: [{ attachmentIndex: 0, attempts: [] }],
    },
  };
  mocks.describeImageFile.mockResolvedValueOnce(skippedWithoutAttempts as never);

  const cliRun = runRegisteredCli({
    register: registerCapabilityCli as (program: Command) => void,
    argv: ["capability", "image", "describe", "--file", "photo.jpg", "--json"],
  });
  await expect(cliRun).rejects.toThrow("exit 1");

  // The error must name the missing provider and point at a config key to set.
  const expectedFragments = [
    "No image understanding provider is configured or ready",
    "agents.defaults.imageModel.primary",
  ];
  for (const fragment of expectedFragments) {
    expect(mocks.runtime.error).toHaveBeenCalledWith(expect.stringContaining(fragment));
  }
});
|
||||
|
||||
it("reports missing image understanding configuration for image describe-many", async () => {
  // Same "skipped, nothing attempted" result as the single-file case, routed
  // through the describe-many subcommand.
  const skippedWithoutAttempts = {
    text: undefined,
    decision: {
      capability: "image",
      outcome: "skipped",
      attachments: [{ attachmentIndex: 0, attempts: [] }],
    },
  };
  mocks.describeImageFile.mockResolvedValueOnce(skippedWithoutAttempts as never);

  const cliRun = runRegisteredCli({
    register: registerCapabilityCli as (program: Command) => void,
    argv: ["capability", "image", "describe-many", "--file", "photo.jpg", "--json"],
  });
  await expect(cliRun).rejects.toThrow("exit 1");

  const missingProviderNotice = expect.stringContaining(
    "No image understanding provider is configured or ready",
  );
  expect(mocks.runtime.error).toHaveBeenCalledWith(missingProviderNotice);
});
|
||||
|
||||
it("rewrites mismatched explicit image output extensions to the detected file type", async () => {
|
||||
const jpegBase64 =
|
||||
"/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxAQEBUQEBAVFRUVFRUVFRUVFRUVFRUVFRUXFhUVFRUYHSggGBolHRUVITEhJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGi0fHyUtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLf/AABEIAAEAAQMBIgACEQEDEQH/xAAXAAEBAQEAAAAAAAAAAAAAAAAAAQID/8QAFhEBAQEAAAAAAAAAAAAAAAAAAAER/9oADAMBAAIQAxAAAAH2AP/EABgQAQEAAwAAAAAAAAAAAAAAAAEAEQIS/9oACAEBAAEFAk1o7//EABYRAQEBAAAAAAAAAAAAAAAAAAABEf/aAAgBAwEBPwGn/8QAFhEBAQEAAAAAAAAAAAAAAAAAABEB/9oACAECAQE/AYf/xAAaEAACAgMAAAAAAAAAAAAAAAABEQAhMUFh/9oACAEBAAY/AjK9cY2f/8QAGhABAQACAwAAAAAAAAAAAAAAAAERITFBUf/aAAgBAQABPyGQk7W5jVYkA//Z";
|
||||
@@ -1278,6 +1323,30 @@ describe("capability cli", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("reports missing audio transcription configuration for audio transcribe", async () => {
  // Skipped decision with one attachment and zero attempts: the shape the CLI
  // is expected to report as a provider-configuration problem.
  const skippedWithoutAttempts = {
    text: undefined,
    decision: {
      capability: "audio",
      outcome: "skipped",
      attachments: [{ attachmentIndex: 0, attempts: [] }],
    },
  };
  mocks.transcribeAudioFile.mockResolvedValueOnce(skippedWithoutAttempts as never);

  const cliRun = runRegisteredCli({
    register: registerCapabilityCli as (program: Command) => void,
    argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"],
  });
  await expect(cliRun).rejects.toThrow("exit 1");

  // The error must name the missing provider and point at the config key to set.
  const expectedFragments = [
    "No audio transcription provider is configured or ready",
    "tools.media.audio.models",
  ];
  for (const fragment of expectedFragments) {
    expect(mocks.runtime.error).toHaveBeenCalledWith(expect.stringContaining(fragment));
  }
});
|
||||
|
||||
it("surfaces the underlying transcription failure for audio transcribe", async () => {
|
||||
mocks.transcribeAudioFile.mockRejectedValueOnce(
|
||||
new Error("Audio transcription response missing text"),
|
||||
|
||||
@@ -30,6 +30,7 @@ import type {
|
||||
ImageGenerationOutputFormat,
|
||||
} from "../image-generation/types.js";
|
||||
import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js";
|
||||
import type { RunMediaUnderstandingFileResult } from "../media-understanding/runtime-types.js";
|
||||
import {
|
||||
describeImageFile,
|
||||
describeImageFileWithModel,
|
||||
@@ -964,6 +965,11 @@ async function runImageDescribe(params: {
|
||||
timeoutMs: params.timeoutMs,
|
||||
});
|
||||
if (!result.text) {
|
||||
if (isMissingMediaUnderstandingProvider(result)) {
|
||||
throw new Error(
|
||||
"No image understanding provider is configured or ready. Configure tools.media.image.models or agents.defaults.imageModel.primary, or pass --model <provider/model> after configuring that provider's auth/API key.",
|
||||
);
|
||||
}
|
||||
throw new Error(`No description returned for image: ${resolvedPath}`);
|
||||
}
|
||||
return {
|
||||
@@ -986,6 +992,15 @@ async function runImageDescribe(params: {
|
||||
} satisfies CapabilityEnvelope;
|
||||
}
|
||||
|
||||
/**
 * Reports whether a media-understanding run came back empty because nothing was ever attempted.
 *
 * A result matches only when its decision has outcome `"skipped"`, lists at least one
 * attachment, and every listed attachment carries an empty `attempts` array. Callers use
 * this to raise a provider-configuration error instead of a generic "nothing returned" one.
 *
 * A missing decision, or one that is structurally incomplete (no `attachments` array, a
 * nullish attachment, or an attachment without an `attempts` array), yields `false` rather
 * than throwing, so the caller's generic error is not masked by a TypeError.
 *
 * @param result - Outcome of a media-understanding file run.
 * @returns `true` only for a skipped decision whose attachments all recorded zero attempts.
 */
function isMissingMediaUnderstandingProvider(result: RunMediaUnderstandingFileResult): boolean {
  const decision = result.decision;
  if (decision?.outcome !== "skipped") {
    return false;
  }
  // Widen to unknown: the decision may arrive from a runtime boundary (or a test mock)
  // that does not honor the declared shape, so validate before dereferencing.
  const attachments: unknown = decision.attachments;
  if (!Array.isArray(attachments) || attachments.length === 0) {
    return false;
  }
  return attachments.every((attachment: { attempts?: unknown } | null | undefined) => {
    const attempts = attachment?.attempts;
    return Array.isArray(attempts) && attempts.length === 0;
  });
}
|
||||
|
||||
async function runAudioTranscribe(params: {
|
||||
file: string;
|
||||
language?: string;
|
||||
@@ -1002,6 +1017,11 @@ async function runAudioTranscribe(params: {
|
||||
prompt: params.prompt,
|
||||
});
|
||||
if (!result.text) {
|
||||
if (isMissingMediaUnderstandingProvider(result)) {
|
||||
throw new Error(
|
||||
"No audio transcription provider is configured or ready. Configure tools.media.audio.models, or pass --model <provider/model> after configuring that provider's auth/API key.",
|
||||
);
|
||||
}
|
||||
throw new Error(`No transcript returned for audio: ${path.resolve(params.file)}`);
|
||||
}
|
||||
return {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
import type { OpenClawConfig } from "../config/types.js";
|
||||
import type { ActiveMediaModel } from "./active-model.types.js";
|
||||
import type { MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.js";
|
||||
import type {
|
||||
MediaUnderstandingDecision,
|
||||
MediaUnderstandingOutput,
|
||||
MediaUnderstandingProvider,
|
||||
} from "./types.js";
|
||||
|
||||
export type RunMediaUnderstandingFileParams = {
|
||||
capability: "image" | "audio" | "video";
|
||||
@@ -18,6 +22,7 @@ export type RunMediaUnderstandingFileResult = {
|
||||
provider?: string;
|
||||
model?: string;
|
||||
output?: MediaUnderstandingOutput;
|
||||
decision?: MediaUnderstandingDecision;
|
||||
};
|
||||
|
||||
export type DescribeImageFileParams = {
|
||||
@@ -73,5 +78,7 @@ export type MediaUnderstandingRuntime = {
|
||||
params: DescribeImageFileWithModelParams,
|
||||
) => Promise<DescribeImageFileWithModelResult>;
|
||||
describeVideoFile: (params: DescribeVideoFileParams) => Promise<RunMediaUnderstandingFileResult>;
|
||||
transcribeAudioFile: (params: TranscribeAudioFileParams) => Promise<{ text: string | undefined }>;
|
||||
transcribeAudioFile: (
|
||||
params: TranscribeAudioFileParams,
|
||||
) => Promise<RunMediaUnderstandingFileResult>;
|
||||
};
|
||||
|
||||
@@ -63,12 +63,46 @@ describe("media-understanding runtime", () => {
|
||||
provider: undefined,
|
||||
model: undefined,
|
||||
output: undefined,
|
||||
decision: { capability: "image", outcome: "disabled", attachments: [] },
|
||||
});
|
||||
|
||||
expect(mocks.buildProviderRegistry).not.toHaveBeenCalled();
|
||||
expect(mocks.runCapability).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("preserves skipped decisions when no media provider is available", async () => {
  const samplePath = "/tmp/sample.ogg";
  const sampleMime = "audio/ogg";

  // Decision the capability runner hands back: skipped, one attachment, no attempts.
  const skippedDecision = {
    capability: "audio" as const,
    outcome: "skipped" as const,
    attachments: [{ attachmentIndex: 0, attempts: [] }],
  };

  mocks.normalizeMediaAttachments.mockReturnValue([
    { index: 0, path: samplePath, mime: sampleMime },
  ]);
  mocks.runCapability.mockResolvedValue({ outputs: [], decision: skippedDecision });

  const pendingResult = runMediaUnderstandingFile({
    capability: "audio",
    filePath: samplePath,
    mime: sampleMime,
    cfg: {} as OpenClawConfig,
    agentDir: "/tmp/agent",
  });

  // The decision must come through unchanged next to the empty text/provider/model/output.
  await expect(pendingResult).resolves.toEqual({
    text: undefined,
    provider: undefined,
    model: undefined,
    output: undefined,
    decision: skippedDecision,
  });

  expect(mocks.cleanup).toHaveBeenCalledTimes(1);
});
|
||||
|
||||
it("returns the matching capability output", async () => {
|
||||
const output: MediaUnderstandingOutput = {
|
||||
kind: "image.description",
|
||||
|
||||
@@ -84,7 +84,10 @@ export async function runMediaUnderstandingFile(
|
||||
const ctx = buildFileContext(params);
|
||||
const attachments = normalizeMediaAttachments(ctx);
|
||||
if (attachments.length === 0) {
|
||||
return { text: undefined };
|
||||
return {
|
||||
text: undefined,
|
||||
decision: { capability: params.capability, outcome: "no-attachment", attachments: [] },
|
||||
};
|
||||
}
|
||||
const config = cfg.tools?.media?.[params.capability];
|
||||
if (config?.enabled === false) {
|
||||
@@ -93,6 +96,7 @@ export async function runMediaUnderstandingFile(
|
||||
provider: undefined,
|
||||
model: undefined,
|
||||
output: undefined,
|
||||
decision: { capability: params.capability, outcome: "disabled", attachments: [] },
|
||||
};
|
||||
}
|
||||
|
||||
@@ -124,12 +128,16 @@ export async function runMediaUnderstandingFile(
|
||||
(entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
|
||||
);
|
||||
const text = output?.text?.trim();
|
||||
return {
|
||||
const fileResult: RunMediaUnderstandingFileResult = {
|
||||
text: text || undefined,
|
||||
provider: output?.provider,
|
||||
model: output?.model,
|
||||
output,
|
||||
};
|
||||
if (result.decision) {
|
||||
fileResult.decision = result.decision;
|
||||
}
|
||||
return fileResult;
|
||||
} finally {
|
||||
await cache.cleanup();
|
||||
}
|
||||
@@ -171,7 +179,7 @@ export async function describeVideoFile(
|
||||
|
||||
export async function transcribeAudioFile(
|
||||
params: TranscribeAudioFileParams,
|
||||
): Promise<{ text: string | undefined }> {
|
||||
): Promise<RunMediaUnderstandingFileResult> {
|
||||
const cfg =
|
||||
params.language || params.prompt
|
||||
? {
|
||||
@@ -192,5 +200,5 @@ export async function transcribeAudioFile(
|
||||
}
|
||||
: params.cfg;
|
||||
const result = await runMediaUnderstandingFile({ ...params, cfg, capability: "audio" });
|
||||
return { text: result.text };
|
||||
return result;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user