mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:40:44 +00:00
fix(cli): wire image describe prompt options
This commit is contained in:
@@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Build/runtime: preserve staged bundled-plugin runtime dependency caches across source-checkout tsdown rebuilds, so local CLI and gateway-watch rebuilds no longer recreate large plugin dependency trees before starting. Refs #73205. Thanks @SymbolStar.
|
||||
- CLI/channels: list configured chat channel accounts from read-only setup metadata even when the standalone CLI has not loaded the runtime channel registry, so `openclaw channels list` shows Telegram accounts before auth providers. Fixes #73319 and #73322. Thanks @mlaihk.
|
||||
- CLI/model probes: keep `infer model run --gateway` raw by skipping prior session transcript, bootstrap context, context-engine assembly, tools, and bundled MCP servers, so local backends can be tested without full agent-context overhead. Fixes #73308. Thanks @ScientificProgrammer.
|
||||
- CLI/image describe: pass `--prompt` and `--timeout-ms` through `infer image describe` and `describe-many`, so custom vision instructions and slow local model budgets reach media-understanding providers such as Ollama, OpenAI, Google, and OpenRouter. Addresses #63700. Thanks @cedricjanssens.
|
||||
- CLI/model probes: reject empty or whitespace-only `infer model run --prompt` values before calling local providers or the Gateway, so smoke checks do not spend provider calls on invalid turns. Fixes #73185. Thanks @iot2edge.
|
||||
- Gateway/media: route text-only `chat.send` image offloads through media-understanding fields so `agents.defaults.imageModel` can describe WebChat attachments instead of leaving only an opaque `media://inbound` marker. Fixes #72968. Thanks @vorajeeah.
|
||||
- Gateway/Windows: route no-listener restart handoffs through the Windows supervisor without leaving restart tokens in flight, so failed task scheduling can be retried and successful handoffs do not coalesce later restart requests. (#69056) Thanks @Thatgfsj.
|
||||
|
||||
@@ -107,18 +107,18 @@ and the shared capability runtime before the provider request is made.
|
||||
|
||||
This table maps common inference tasks to the corresponding infer command.
|
||||
|
||||
| Task | Command | Notes |
|
||||
| ----------------------- | ---------------------------------------------------------------------- | ----------------------------------------------------- |
|
||||
| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default |
|
||||
| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file |
|
||||
| Describe an image file | `openclaw infer image describe --file ./image.png --json` | `--model` must be an image-capable `<provider/model>` |
|
||||
| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` |
|
||||
| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented |
|
||||
| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` |
|
||||
| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` |
|
||||
| Search the web | `openclaw infer web search --query "..." --json` | |
|
||||
| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | |
|
||||
| Create embeddings | `openclaw infer embedding create --text "..." --json` | |
|
||||
| Task | Command | Notes |
|
||||
| ----------------------- | ------------------------------------------------------------------------ | ----------------------------------------------------- |
|
||||
| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default |
|
||||
| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file |
|
||||
| Describe an image file | `openclaw infer image describe --file ./image.png --prompt "..." --json` | `--model` must be an image-capable `<provider/model>` |
|
||||
| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` |
|
||||
| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented |
|
||||
| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` |
|
||||
| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` |
|
||||
| Search the web | `openclaw infer web search --query "..." --json` | |
|
||||
| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | |
|
||||
| Create embeddings | `openclaw infer embedding create --text "..." --json` | |
|
||||
|
||||
## Behavior
|
||||
|
||||
@@ -176,8 +176,10 @@ openclaw infer image generate --prompt "slow image backend" --timeout-ms 180000
|
||||
openclaw infer image edit --file ./logo.png --model openai/gpt-image-1.5 --output-format png --background transparent --prompt "keep the logo, remove the background" --json
|
||||
openclaw infer image edit --file ./poster.png --prompt "make this a vertical story ad" --size 2160x3840 --aspect-ratio 9:16 --resolution 4K --json
|
||||
openclaw infer image describe --file ./photo.jpg --json
|
||||
openclaw infer image describe --file ./receipt.jpg --prompt "Extract the merchant, date, and total" --json
|
||||
openclaw infer image describe-many --file ./before.png --file ./after.png --prompt "Compare the screenshots and list visible UI changes" --json
|
||||
openclaw infer image describe --file ./ui-screenshot.png --model openai/gpt-4.1-mini --json
|
||||
openclaw infer image describe --file ./photo.jpg --model ollama/qwen2.5vl:7b --json
|
||||
openclaw infer image describe --file ./photo.jpg --model ollama/qwen2.5vl:7b --prompt "Describe the image in one sentence" --timeout-ms 300000 --json
|
||||
```
|
||||
|
||||
Notes:
|
||||
@@ -208,6 +210,8 @@ Notes:
|
||||
output paths. When `--output` is set, the final extension may follow the
|
||||
provider's returned MIME type.
|
||||
|
||||
- For `image describe` and `image describe-many`, use `--prompt` to give the vision model a task-specific instruction such as OCR, comparison, UI inspection, or concise captioning.
|
||||
- Use `--timeout-ms` to raise the provider request timeout (in milliseconds) for slow local vision models or cold Ollama starts.
|
||||
- For `image describe`, `--model` must be an image-capable `<provider/model>`.
|
||||
- For local Ollama vision models, pull the model first and set `OLLAMA_API_KEY` to any placeholder value, for example `ollama-local`. See [Ollama](/providers/ollama#vision-and-image-description).
|
||||
|
||||
|
||||
@@ -521,6 +521,32 @@ describe("capability cli", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("passes image describe prompts through media understanding", async () => {
|
||||
await runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
argv: [
|
||||
"capability",
|
||||
"image",
|
||||
"describe",
|
||||
"--file",
|
||||
"photo.jpg",
|
||||
"--prompt",
|
||||
"Read the menu text",
|
||||
"--timeout-ms",
|
||||
"90000",
|
||||
"--json",
|
||||
],
|
||||
});
|
||||
|
||||
expect(mocks.describeImageFile).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
filePath: expect.stringMatching(/photo\.jpg$/),
|
||||
prompt: "Read the menu text",
|
||||
timeoutMs: 90000,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("uses the explicit media-understanding provider for image describe model overrides", async () => {
|
||||
await runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
@@ -532,6 +558,10 @@ describe("capability cli", () => {
|
||||
"photo.jpg",
|
||||
"--model",
|
||||
"ollama/qwen2.5vl:7b",
|
||||
"--prompt",
|
||||
"Count visible buttons",
|
||||
"--timeout-ms",
|
||||
"120000",
|
||||
"--json",
|
||||
],
|
||||
});
|
||||
@@ -541,6 +571,8 @@ describe("capability cli", () => {
|
||||
filePath: expect.stringMatching(/photo\.jpg$/),
|
||||
provider: "ollama",
|
||||
model: "qwen2.5vl:7b",
|
||||
prompt: "Count visible buttons",
|
||||
timeoutMs: 120000,
|
||||
}),
|
||||
);
|
||||
expect(mocks.describeImageFile).not.toHaveBeenCalled();
|
||||
@@ -552,6 +584,44 @@ describe("capability cli", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("passes describe-many prompts to each image", async () => {
|
||||
await runRegisteredCli({
|
||||
register: registerCapabilityCli as (program: Command) => void,
|
||||
argv: [
|
||||
"capability",
|
||||
"image",
|
||||
"describe-many",
|
||||
"--file",
|
||||
"a.jpg",
|
||||
"--file",
|
||||
"b.jpg",
|
||||
"--prompt",
|
||||
"Extract all visible labels",
|
||||
"--timeout-ms",
|
||||
"45000",
|
||||
"--json",
|
||||
],
|
||||
});
|
||||
|
||||
expect(mocks.describeImageFile).toHaveBeenCalledTimes(2);
|
||||
expect(mocks.describeImageFile).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
expect.objectContaining({
|
||||
filePath: expect.stringMatching(/a\.jpg$/),
|
||||
prompt: "Extract all visible labels",
|
||||
timeoutMs: 45000,
|
||||
}),
|
||||
);
|
||||
expect(mocks.describeImageFile).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
expect.objectContaining({
|
||||
filePath: expect.stringMatching(/b\.jpg$/),
|
||||
prompt: "Extract all visible labels",
|
||||
timeoutMs: 45000,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("fails image describe when no description text is returned", async () => {
|
||||
mocks.describeImageFile.mockResolvedValueOnce({
|
||||
text: undefined,
|
||||
|
||||
@@ -199,14 +199,14 @@ const CAPABILITY_METADATA: CapabilityMetadata[] = [
|
||||
id: "image.describe",
|
||||
description: "Describe one image file through media-understanding providers.",
|
||||
transports: ["local"],
|
||||
flags: ["--file", "--prompt", "--model", "--json"],
|
||||
flags: ["--file", "--prompt", "--model", "--timeout-ms", "--json"],
|
||||
resultShape: "normalized text output",
|
||||
},
|
||||
{
|
||||
id: "image.describe-many",
|
||||
description: "Describe multiple image files independently.",
|
||||
transports: ["local"],
|
||||
flags: ["--file", "--prompt", "--model", "--json"],
|
||||
flags: ["--file", "--prompt", "--model", "--timeout-ms", "--json"],
|
||||
resultShape: "one text output per file",
|
||||
},
|
||||
{
|
||||
@@ -855,10 +855,13 @@ async function runImageDescribe(params: {
|
||||
capability: "image.describe" | "image.describe-many";
|
||||
files: string[];
|
||||
model?: string;
|
||||
prompt?: string;
|
||||
timeoutMs?: number;
|
||||
}) {
|
||||
const cfg = getRuntimeConfig();
|
||||
const agentDir = resolveAgentDir(cfg, resolveDefaultAgentId(cfg));
|
||||
const activeModel = requireProviderModelOverride(params.model);
|
||||
const prompt = normalizeOptionalString(params.prompt);
|
||||
const outputs = await Promise.all(
|
||||
params.files.map(async (filePath) => {
|
||||
const resolvedPath = path.resolve(filePath);
|
||||
@@ -869,12 +872,15 @@ async function runImageDescribe(params: {
|
||||
agentDir,
|
||||
provider: activeModel.provider,
|
||||
model: activeModel.model,
|
||||
prompt: "Describe the image.",
|
||||
prompt: prompt ?? "Describe the image.",
|
||||
timeoutMs: params.timeoutMs,
|
||||
})
|
||||
: await describeImageFile({
|
||||
filePath: resolvedPath,
|
||||
cfg,
|
||||
agentDir,
|
||||
prompt,
|
||||
timeoutMs: params.timeoutMs,
|
||||
});
|
||||
if (!result.text) {
|
||||
throw new Error(`No description returned for image: ${resolvedPath}`);
|
||||
@@ -1676,7 +1682,9 @@ export function registerCapabilityCli(program: Command) {
|
||||
.command("describe")
|
||||
.description("Describe one image file")
|
||||
.requiredOption("--file <path>", "Image file")
|
||||
.option("--prompt <text>", "Prompt hint")
|
||||
.option("--model <provider/model>", "Model override")
|
||||
.option("--timeout-ms <ms>", "Provider request timeout in milliseconds")
|
||||
.option("--json", "Output JSON", false)
|
||||
.action(async (opts) => {
|
||||
await runCommandWithRuntime(defaultRuntime, async () => {
|
||||
@@ -1684,6 +1692,8 @@ export function registerCapabilityCli(program: Command) {
|
||||
capability: "image.describe",
|
||||
files: [String(opts.file)],
|
||||
model: opts.model as string | undefined,
|
||||
prompt: opts.prompt as string | undefined,
|
||||
timeoutMs: parseOptionalFiniteNumber(opts.timeoutMs, "--timeout-ms"),
|
||||
});
|
||||
emitJsonOrText(defaultRuntime, Boolean(opts.json), result, formatEnvelopeForText);
|
||||
});
|
||||
@@ -1693,7 +1703,9 @@ export function registerCapabilityCli(program: Command) {
|
||||
.command("describe-many")
|
||||
.description("Describe multiple image files")
|
||||
.requiredOption("--file <path>", "Image file", collectOption, [])
|
||||
.option("--prompt <text>", "Prompt hint")
|
||||
.option("--model <provider/model>", "Model override")
|
||||
.option("--timeout-ms <ms>", "Provider request timeout in milliseconds")
|
||||
.option("--json", "Output JSON", false)
|
||||
.action(async (opts) => {
|
||||
await runCommandWithRuntime(defaultRuntime, async () => {
|
||||
@@ -1701,6 +1713,8 @@ export function registerCapabilityCli(program: Command) {
|
||||
capability: "image.describe-many",
|
||||
files: opts.file as string[],
|
||||
model: opts.model as string | undefined,
|
||||
prompt: opts.prompt as string | undefined,
|
||||
timeoutMs: parseOptionalFiniteNumber(opts.timeoutMs, "--timeout-ms"),
|
||||
});
|
||||
emitJsonOrText(defaultRuntime, Boolean(opts.json), result, formatEnvelopeForText);
|
||||
});
|
||||
|
||||
@@ -9,6 +9,8 @@ export type RunMediaUnderstandingFileParams = {
|
||||
agentDir?: string;
|
||||
mime?: string;
|
||||
activeModel?: ActiveMediaModel;
|
||||
prompt?: string;
|
||||
timeoutMs?: number;
|
||||
};
|
||||
|
||||
export type RunMediaUnderstandingFileResult = {
|
||||
@@ -24,6 +26,8 @@ export type DescribeImageFileParams = {
|
||||
agentDir?: string;
|
||||
mime?: string;
|
||||
activeModel?: ActiveMediaModel;
|
||||
prompt?: string;
|
||||
timeoutMs?: number;
|
||||
};
|
||||
|
||||
export type DescribeImageFileWithModelParams = {
|
||||
|
||||
@@ -102,6 +102,49 @@ describe("media-understanding runtime", () => {
|
||||
expect(mocks.cleanup).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("passes per-request image prompts into media understanding config", async () => {
|
||||
const output: MediaUnderstandingOutput = {
|
||||
kind: "image.description",
|
||||
attachmentIndex: 0,
|
||||
provider: "vision-plugin",
|
||||
model: "vision-v1",
|
||||
text: "button count ok",
|
||||
};
|
||||
mocks.normalizeMediaAttachments.mockReturnValue([
|
||||
{ index: 0, path: "/tmp/sample.jpg", mime: "image/jpeg" },
|
||||
]);
|
||||
mocks.runCapability.mockResolvedValue({
|
||||
outputs: [output],
|
||||
});
|
||||
|
||||
await describeImageFile({
|
||||
filePath: "/tmp/sample.jpg",
|
||||
mime: "image/jpeg",
|
||||
cfg: {
|
||||
tools: {
|
||||
media: {
|
||||
image: {
|
||||
prompt: "default image prompt",
|
||||
},
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig,
|
||||
agentDir: "/tmp/agent",
|
||||
prompt: "Count visible buttons",
|
||||
timeoutMs: 90_000,
|
||||
});
|
||||
|
||||
expect(mocks.runCapability).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
config: expect.objectContaining({
|
||||
prompt: "Count visible buttons",
|
||||
_requestPromptOverride: "Count visible buttons",
|
||||
timeoutSeconds: 90,
|
||||
}),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("surfaces the underlying provider failure when media understanding fails", async () => {
|
||||
mocks.normalizeMediaAttachments.mockReturnValue([
|
||||
{ index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },
|
||||
|
||||
@@ -50,12 +50,43 @@ function buildFileContext(params: { filePath: string; mime?: string }) {
|
||||
export async function runMediaUnderstandingFile(
|
||||
params: RunMediaUnderstandingFileParams,
|
||||
): Promise<RunMediaUnderstandingFileResult> {
|
||||
const requestPrompt = params.prompt?.trim();
|
||||
const requestTimeoutSeconds =
|
||||
typeof params.timeoutMs === "number" &&
|
||||
Number.isFinite(params.timeoutMs) &&
|
||||
params.timeoutMs > 0
|
||||
? Math.ceil(params.timeoutMs / 1000)
|
||||
: undefined;
|
||||
const cfg =
|
||||
requestPrompt || requestTimeoutSeconds !== undefined
|
||||
? {
|
||||
...params.cfg,
|
||||
tools: {
|
||||
...params.cfg.tools,
|
||||
media: {
|
||||
...params.cfg.tools?.media,
|
||||
[params.capability]: {
|
||||
...params.cfg.tools?.media?.[params.capability],
|
||||
...(requestPrompt
|
||||
? {
|
||||
prompt: requestPrompt,
|
||||
_requestPromptOverride: requestPrompt,
|
||||
}
|
||||
: {}),
|
||||
...(requestTimeoutSeconds !== undefined
|
||||
? { timeoutSeconds: requestTimeoutSeconds }
|
||||
: {}),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
: params.cfg;
|
||||
const ctx = buildFileContext(params);
|
||||
const attachments = normalizeMediaAttachments(ctx);
|
||||
if (attachments.length === 0) {
|
||||
return { text: undefined };
|
||||
}
|
||||
const config = params.cfg.tools?.media?.[params.capability];
|
||||
const config = cfg.tools?.media?.[params.capability];
|
||||
if (config?.enabled === false) {
|
||||
return {
|
||||
text: undefined,
|
||||
@@ -65,16 +96,16 @@ export async function runMediaUnderstandingFile(
|
||||
};
|
||||
}
|
||||
|
||||
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
|
||||
const providerRegistry = buildProviderRegistry(undefined, cfg);
|
||||
const cache = createMediaAttachmentCache(attachments, {
|
||||
localPathRoots: [path.dirname(params.filePath)],
|
||||
ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy,
|
||||
ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy,
|
||||
});
|
||||
|
||||
try {
|
||||
const result = await runCapability({
|
||||
capability: params.capability,
|
||||
cfg: params.cfg,
|
||||
cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media: attachments,
|
||||
|
||||
Reference in New Issue
Block a user