fix(cli): wire image describe prompt options

This commit is contained in:
Peter Steinberger
2026-04-28 10:53:47 +01:00
parent 0bc8b9a95a
commit 6f8792f3f1
7 changed files with 187 additions and 20 deletions

View File

@@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai
- Build/runtime: preserve staged bundled-plugin runtime dependency caches across source-checkout tsdown rebuilds, so local CLI and gateway-watch rebuilds no longer recreate large plugin dependency trees before starting. Refs #73205. Thanks @SymbolStar.
- CLI/channels: list configured chat channel accounts from read-only setup metadata even when the standalone CLI has not loaded the runtime channel registry, so `openclaw channels list` shows Telegram accounts before auth providers. Fixes #73319 and #73322. Thanks @mlaihk.
- CLI/model probes: keep `infer model run --gateway` raw by skipping prior session transcript, bootstrap context, context-engine assembly, tools, and bundled MCP servers, so local backends can be tested without full agent-context overhead. Fixes #73308. Thanks @ScientificProgrammer.
- CLI/image describe: pass `--prompt` and `--timeout-ms` through `infer image describe` and `describe-many`, so custom vision instructions and slow local model budgets reach media-understanding providers such as Ollama, OpenAI, Google, and OpenRouter. Addresses #63700. Thanks @cedricjanssens.
- CLI/model probes: reject empty or whitespace-only `infer model run --prompt` values before calling local providers or the Gateway, so smoke checks do not spend provider calls on invalid turns. Fixes #73185. Thanks @iot2edge.
- Gateway/media: route text-only `chat.send` image offloads through media-understanding fields so `agents.defaults.imageModel` can describe WebChat attachments instead of leaving only an opaque `media://inbound` marker. Fixes #72968. Thanks @vorajeeah.
- Gateway/Windows: route no-listener restart handoffs through the Windows supervisor without leaving restart tokens in flight, so failed task scheduling can be retried and successful handoffs do not coalesce later restart requests. (#69056) Thanks @Thatgfsj.

View File

@@ -107,18 +107,18 @@ and the shared capability runtime before the provider request is made.
This table maps common inference tasks to the corresponding infer command.
| Task | Command | Notes |
| ----------------------- | ---------------------------------------------------------------------- | ----------------------------------------------------- |
| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default |
| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file |
| Describe an image file | `openclaw infer image describe --file ./image.png --json` | `--model` must be an image-capable `<provider/model>` |
| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` |
| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented |
| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` |
| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` |
| Search the web | `openclaw infer web search --query "..." --json` | |
| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | |
| Create embeddings | `openclaw infer embedding create --text "..." --json` | |
| Task | Command | Notes |
| ----------------------- | ------------------------------------------------------------------------ | ----------------------------------------------------- |
| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default |
| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file |
| Describe an image file | `openclaw infer image describe --file ./image.png --prompt "..." --json` | `--model` must be an image-capable `<provider/model>` |
| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` |
| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented |
| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` |
| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` |
| Search the web | `openclaw infer web search --query "..." --json` | |
| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | |
| Create embeddings | `openclaw infer embedding create --text "..." --json` | |
## Behavior
@@ -176,8 +176,10 @@ openclaw infer image generate --prompt "slow image backend" --timeout-ms 180000
openclaw infer image edit --file ./logo.png --model openai/gpt-image-1.5 --output-format png --background transparent --prompt "keep the logo, remove the background" --json
openclaw infer image edit --file ./poster.png --prompt "make this a vertical story ad" --size 2160x3840 --aspect-ratio 9:16 --resolution 4K --json
openclaw infer image describe --file ./photo.jpg --json
openclaw infer image describe --file ./receipt.jpg --prompt "Extract the merchant, date, and total" --json
openclaw infer image describe-many --file ./before.png --file ./after.png --prompt "Compare the screenshots and list visible UI changes" --json
openclaw infer image describe --file ./ui-screenshot.png --model openai/gpt-4.1-mini --json
openclaw infer image describe --file ./photo.jpg --model ollama/qwen2.5vl:7b --json
openclaw infer image describe --file ./photo.jpg --model ollama/qwen2.5vl:7b --prompt "Describe the image in one sentence" --timeout-ms 300000 --json
```
Notes:
@@ -208,6 +210,8 @@ Notes:
output paths. When `--output` is set, the final extension may follow the
provider's returned MIME type.
- For `image describe` and `image describe-many`, use `--prompt` to give the vision model a task-specific instruction such as OCR, comparison, UI inspection, or concise captioning.
- Use `--timeout-ms` to set the provider request timeout in milliseconds, for example with slow local vision models or cold Ollama starts.
- For `image describe` and `image describe-many`, `--model` must be an image-capable `<provider/model>`.
- For local Ollama vision models, pull the model first and set `OLLAMA_API_KEY` to any placeholder value, for example `ollama-local`. See [Ollama](/providers/ollama#vision-and-image-description).

View File

@@ -521,6 +521,32 @@ describe("capability cli", () => {
);
});
// Verifies that `image describe` forwards --prompt and --timeout-ms to
// describeImageFile when no --model override is supplied.
it("passes image describe prompts through media understanding", async () => {
await runRegisteredCli({
register: registerCapabilityCli as (program: Command) => void,
argv: [
"capability",
"image",
"describe",
"--file",
"photo.jpg",
"--prompt",
"Read the menu text",
"--timeout-ms",
"90000",
"--json",
],
});
// The timeout is given as a string on argv and is expected to arrive as a number.
expect(mocks.describeImageFile).toHaveBeenCalledWith(
expect.objectContaining({
// Only the path suffix is matched, so the assertion does not depend on the working directory.
filePath: expect.stringMatching(/photo\.jpg$/),
prompt: "Read the menu text",
timeoutMs: 90000,
}),
);
});
it("uses the explicit media-understanding provider for image describe model overrides", async () => {
await runRegisteredCli({
register: registerCapabilityCli as (program: Command) => void,
@@ -532,6 +558,10 @@ describe("capability cli", () => {
"photo.jpg",
"--model",
"ollama/qwen2.5vl:7b",
"--prompt",
"Count visible buttons",
"--timeout-ms",
"120000",
"--json",
],
});
@@ -541,6 +571,8 @@ describe("capability cli", () => {
filePath: expect.stringMatching(/photo\.jpg$/),
provider: "ollama",
model: "qwen2.5vl:7b",
prompt: "Count visible buttons",
timeoutMs: 120000,
}),
);
expect(mocks.describeImageFile).not.toHaveBeenCalled();
@@ -552,6 +584,44 @@ describe("capability cli", () => {
);
});
// Verifies that `image describe-many` applies a single --prompt/--timeout-ms
// pair to every --file: one describeImageFile call per image, each carrying
// the same prompt and numeric timeout.
it("passes describe-many prompts to each image", async () => {
await runRegisteredCli({
register: registerCapabilityCli as (program: Command) => void,
argv: [
"capability",
"image",
"describe-many",
"--file",
"a.jpg",
"--file",
"b.jpg",
"--prompt",
"Extract all visible labels",
"--timeout-ms",
"45000",
"--json",
],
});
// Two --file values should produce exactly two provider calls.
expect(mocks.describeImageFile).toHaveBeenCalledTimes(2);
// Calls are checked by position: the first for a.jpg, the second for b.jpg.
expect(mocks.describeImageFile).toHaveBeenNthCalledWith(
1,
expect.objectContaining({
filePath: expect.stringMatching(/a\.jpg$/),
prompt: "Extract all visible labels",
timeoutMs: 45000,
}),
);
expect(mocks.describeImageFile).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
filePath: expect.stringMatching(/b\.jpg$/),
prompt: "Extract all visible labels",
timeoutMs: 45000,
}),
);
});
it("fails image describe when no description text is returned", async () => {
mocks.describeImageFile.mockResolvedValueOnce({
text: undefined,

View File

@@ -199,14 +199,14 @@ const CAPABILITY_METADATA: CapabilityMetadata[] = [
id: "image.describe",
description: "Describe one image file through media-understanding providers.",
transports: ["local"],
flags: ["--file", "--prompt", "--model", "--json"],
flags: ["--file", "--prompt", "--model", "--timeout-ms", "--json"],
resultShape: "normalized text output",
},
{
id: "image.describe-many",
description: "Describe multiple image files independently.",
transports: ["local"],
flags: ["--file", "--prompt", "--model", "--json"],
flags: ["--file", "--prompt", "--model", "--timeout-ms", "--json"],
resultShape: "one text output per file",
},
{
@@ -855,10 +855,13 @@ async function runImageDescribe(params: {
capability: "image.describe" | "image.describe-many";
files: string[];
model?: string;
prompt?: string;
timeoutMs?: number;
}) {
const cfg = getRuntimeConfig();
const agentDir = resolveAgentDir(cfg, resolveDefaultAgentId(cfg));
const activeModel = requireProviderModelOverride(params.model);
const prompt = normalizeOptionalString(params.prompt);
const outputs = await Promise.all(
params.files.map(async (filePath) => {
const resolvedPath = path.resolve(filePath);
@@ -869,12 +872,15 @@ async function runImageDescribe(params: {
agentDir,
provider: activeModel.provider,
model: activeModel.model,
prompt: "Describe the image.",
prompt: prompt ?? "Describe the image.",
timeoutMs: params.timeoutMs,
})
: await describeImageFile({
filePath: resolvedPath,
cfg,
agentDir,
prompt,
timeoutMs: params.timeoutMs,
});
if (!result.text) {
throw new Error(`No description returned for image: ${resolvedPath}`);
@@ -1676,7 +1682,9 @@ export function registerCapabilityCli(program: Command) {
.command("describe")
.description("Describe one image file")
.requiredOption("--file <path>", "Image file")
.option("--prompt <text>", "Prompt hint")
.option("--model <provider/model>", "Model override")
.option("--timeout-ms <ms>", "Provider request timeout in milliseconds")
.option("--json", "Output JSON", false)
.action(async (opts) => {
await runCommandWithRuntime(defaultRuntime, async () => {
@@ -1684,6 +1692,8 @@ export function registerCapabilityCli(program: Command) {
capability: "image.describe",
files: [String(opts.file)],
model: opts.model as string | undefined,
prompt: opts.prompt as string | undefined,
timeoutMs: parseOptionalFiniteNumber(opts.timeoutMs, "--timeout-ms"),
});
emitJsonOrText(defaultRuntime, Boolean(opts.json), result, formatEnvelopeForText);
});
@@ -1693,7 +1703,9 @@ export function registerCapabilityCli(program: Command) {
.command("describe-many")
.description("Describe multiple image files")
.requiredOption("--file <path>", "Image file", collectOption, [])
.option("--prompt <text>", "Prompt hint")
.option("--model <provider/model>", "Model override")
.option("--timeout-ms <ms>", "Provider request timeout in milliseconds")
.option("--json", "Output JSON", false)
.action(async (opts) => {
await runCommandWithRuntime(defaultRuntime, async () => {
@@ -1701,6 +1713,8 @@ export function registerCapabilityCli(program: Command) {
capability: "image.describe-many",
files: opts.file as string[],
model: opts.model as string | undefined,
prompt: opts.prompt as string | undefined,
timeoutMs: parseOptionalFiniteNumber(opts.timeoutMs, "--timeout-ms"),
});
emitJsonOrText(defaultRuntime, Boolean(opts.json), result, formatEnvelopeForText);
});

View File

@@ -9,6 +9,8 @@ export type RunMediaUnderstandingFileParams = {
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
prompt?: string;
timeoutMs?: number;
};
export type RunMediaUnderstandingFileResult = {
@@ -24,6 +26,8 @@ export type DescribeImageFileParams = {
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
prompt?: string;
timeoutMs?: number;
};
export type DescribeImageFileWithModelParams = {

View File

@@ -102,6 +102,49 @@ describe("media-understanding runtime", () => {
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
});
// Verifies that a per-request prompt and timeout passed to describeImageFile
// show up in the config handed to runCapability, taking precedence over the
// prompt configured under tools.media.image.
it("passes per-request image prompts into media understanding config", async () => {
const output: MediaUnderstandingOutput = {
kind: "image.description",
attachmentIndex: 0,
provider: "vision-plugin",
model: "vision-v1",
text: "button count ok",
};
mocks.normalizeMediaAttachments.mockReturnValue([
{ index: 0, path: "/tmp/sample.jpg", mime: "image/jpeg" },
]);
mocks.runCapability.mockResolvedValue({
outputs: [output],
});
await describeImageFile({
filePath: "/tmp/sample.jpg",
mime: "image/jpeg",
cfg: {
tools: {
media: {
image: {
// Configured default that the per-request prompt below is expected to replace.
prompt: "default image prompt",
},
},
},
} as OpenClawConfig,
agentDir: "/tmp/agent",
prompt: "Count visible buttons",
timeoutMs: 90_000,
});
expect(mocks.runCapability).toHaveBeenCalledWith(
expect.objectContaining({
config: expect.objectContaining({
prompt: "Count visible buttons",
_requestPromptOverride: "Count visible buttons",
// The 90_000 ms request timeout is expected as whole seconds.
timeoutSeconds: 90,
}),
}),
);
});
it("surfaces the underlying provider failure when media understanding fails", async () => {
mocks.normalizeMediaAttachments.mockReturnValue([
{ index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },

View File

@@ -50,12 +50,43 @@ function buildFileContext(params: { filePath: string; mime?: string }) {
export async function runMediaUnderstandingFile(
params: RunMediaUnderstandingFileParams,
): Promise<RunMediaUnderstandingFileResult> {
const requestPrompt = params.prompt?.trim();
const requestTimeoutSeconds =
typeof params.timeoutMs === "number" &&
Number.isFinite(params.timeoutMs) &&
params.timeoutMs > 0
? Math.ceil(params.timeoutMs / 1000)
: undefined;
const cfg =
requestPrompt || requestTimeoutSeconds !== undefined
? {
...params.cfg,
tools: {
...params.cfg.tools,
media: {
...params.cfg.tools?.media,
[params.capability]: {
...params.cfg.tools?.media?.[params.capability],
...(requestPrompt
? {
prompt: requestPrompt,
_requestPromptOverride: requestPrompt,
}
: {}),
...(requestTimeoutSeconds !== undefined
? { timeoutSeconds: requestTimeoutSeconds }
: {}),
},
},
},
}
: params.cfg;
const ctx = buildFileContext(params);
const attachments = normalizeMediaAttachments(ctx);
if (attachments.length === 0) {
return { text: undefined };
}
const config = params.cfg.tools?.media?.[params.capability];
const config = cfg.tools?.media?.[params.capability];
if (config?.enabled === false) {
return {
text: undefined,
@@ -65,16 +96,16 @@ export async function runMediaUnderstandingFile(
};
}
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
const providerRegistry = buildProviderRegistry(undefined, cfg);
const cache = createMediaAttachmentCache(attachments, {
localPathRoots: [path.dirname(params.filePath)],
ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy,
ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
});
try {
const result = await runCapability({
capability: params.capability,
cfg: params.cfg,
cfg,
ctx,
attachments: cache,
media: attachments,