diff --git a/CHANGELOG.md b/CHANGELOG.md index 3be8648ee9c..58d22e1c702 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,7 @@ Docs: https://docs.openclaw.ai - Build/runtime: preserve staged bundled-plugin runtime dependency caches across source-checkout tsdown rebuilds, so local CLI and gateway-watch rebuilds no longer recreate large plugin dependency trees before starting. Refs #73205. Thanks @SymbolStar. - CLI/channels: list configured chat channel accounts from read-only setup metadata even when the standalone CLI has not loaded the runtime channel registry, so `openclaw channels list` shows Telegram accounts before auth providers. Fixes #73319 and #73322. Thanks @mlaihk. - CLI/model probes: keep `infer model run --gateway` raw by skipping prior session transcript, bootstrap context, context-engine assembly, tools, and bundled MCP servers, so local backends can be tested without full agent-context overhead. Fixes #73308. Thanks @ScientificProgrammer. +- CLI/image describe: pass `--prompt` and `--timeout-ms` through `infer image describe` and `describe-many`, so custom vision instructions and slow local model budgets reach media-understanding providers such as Ollama, OpenAI, Google, and OpenRouter. Addresses #63700. Thanks @cedricjanssens. - CLI/model probes: reject empty or whitespace-only `infer model run --prompt` values before calling local providers or the Gateway, so smoke checks do not spend provider calls on invalid turns. Fixes #73185. Thanks @iot2edge. - Gateway/media: route text-only `chat.send` image offloads through media-understanding fields so `agents.defaults.imageModel` can describe WebChat attachments instead of leaving only an opaque `media://inbound` marker. Fixes #72968. Thanks @vorajeeah. - Gateway/Windows: route no-listener restart handoffs through the Windows supervisor without leaving restart tokens in flight, so failed task scheduling can be retried and successful handoffs do not coalesce later restart requests. 
(#69056) Thanks @Thatgfsj. diff --git a/docs/cli/infer.md b/docs/cli/infer.md index 6b1b6f49616..c5644ad3015 100644 --- a/docs/cli/infer.md +++ b/docs/cli/infer.md @@ -107,18 +107,18 @@ and the shared capability runtime before the provider request is made. This table maps common inference tasks to the corresponding infer command. -| Task | Command | Notes | -| ----------------------- | ---------------------------------------------------------------------- | ----------------------------------------------------- | -| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default | -| Generate an image | `openclaw infer image generate --prompt "..." --json` | Use `image edit` when starting from an existing file | -| Describe an image file | `openclaw infer image describe --file ./image.png --json` | `--model` must be an image-capable `<provider/model>` | -| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` | -| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented | -| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` | -| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` | -| Search the web | `openclaw infer web search --query "..." --json` | | -| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | | -| Create embeddings | `openclaw infer embedding create --text "..." --json` | | +| Task | Command | Notes | +| ----------------------- | ------------------------------------------------------------------------ | ----------------------------------------------------- | +| Run a text/model prompt | `openclaw infer model run --prompt "..." --json` | Uses the normal local path by default | +| Generate an image | `openclaw infer image generate --prompt "..." 
--json` | Use `image edit` when starting from an existing file | +| Describe an image file | `openclaw infer image describe --file ./image.png --prompt "..." --json` | `--model` must be an image-capable `<provider/model>` | +| Transcribe audio | `openclaw infer audio transcribe --file ./memo.m4a --json` | `--model` must be `<provider/model>` | +| Synthesize speech | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json` | `tts status` is gateway-oriented | +| Generate a video | `openclaw infer video generate --prompt "..." --json` | Supports provider hints such as `--resolution` | +| Describe a video file | `openclaw infer video describe --file ./clip.mp4 --json` | `--model` must be `<provider/model>` | +| Search the web | `openclaw infer web search --query "..." --json` | | +| Fetch a web page | `openclaw infer web fetch --url https://example.com --json` | | +| Create embeddings | `openclaw infer embedding create --text "..." --json` | | ## Behavior @@ -176,8 +176,10 @@ openclaw infer image generate --prompt "slow image backend" --timeout-ms 180000 openclaw infer image edit --file ./logo.png --model openai/gpt-image-1.5 --output-format png --background transparent --prompt "keep the logo, remove the background" --json openclaw infer image edit --file ./poster.png --prompt "make this a vertical story ad" --size 2160x3840 --aspect-ratio 9:16 --resolution 4K --json openclaw infer image describe --file ./photo.jpg --json +openclaw infer image describe --file ./receipt.jpg --prompt "Extract the merchant, date, and total" --json +openclaw infer image describe-many --file ./before.png --file ./after.png --prompt "Compare the screenshots and list visible UI changes" --json openclaw infer image describe --file ./ui-screenshot.png --model openai/gpt-4.1-mini --json -openclaw infer image describe --file ./photo.jpg --model ollama/qwen2.5vl:7b --json +openclaw infer image describe --file ./photo.jpg --model ollama/qwen2.5vl:7b --prompt "Describe the image in one sentence" --timeout-ms 300000 --json ``` 
Notes: @@ -208,6 +210,8 @@ Notes: output paths. When `--output` is set, the final extension may follow the provider's returned MIME type. +- For `image describe` and `image describe-many`, use `--prompt` to give the vision model a task-specific instruction such as OCR, comparison, UI inspection, or concise captioning. +- Use `--timeout-ms` with slow local vision models or cold Ollama starts. - For `image describe`, `--model` must be an image-capable `<provider/model>`. - For local Ollama vision models, pull the model first and set `OLLAMA_API_KEY` to any placeholder value, for example `ollama-local`. See [Ollama](/providers/ollama#vision-and-image-description). diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts index e473a972a9a..5b618f5b6fa 100644 --- a/src/cli/capability-cli.test.ts +++ b/src/cli/capability-cli.test.ts @@ -521,6 +521,32 @@ describe("capability cli", () => { ); }); + it("passes image describe prompts through media understanding", async () => { + await runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: [ + "capability", + "image", + "describe", + "--file", + "photo.jpg", + "--prompt", + "Read the menu text", + "--timeout-ms", + "90000", + "--json", + ], + }); + + expect(mocks.describeImageFile).toHaveBeenCalledWith( + expect.objectContaining({ + filePath: expect.stringMatching(/photo\.jpg$/), + prompt: "Read the menu text", + timeoutMs: 90000, + }), + ); + }); + it("uses the explicit media-understanding provider for image describe model overrides", async () => { await runRegisteredCli({ register: registerCapabilityCli as (program: Command) => void, @@ -532,6 +558,10 @@ describe("capability cli", () => { "photo.jpg", "--model", "ollama/qwen2.5vl:7b", + "--prompt", + "Count visible buttons", + "--timeout-ms", + "120000", "--json", ], }); @@ -541,6 +571,8 @@ describe("capability cli", () => { filePath: expect.stringMatching(/photo\.jpg$/), provider: "ollama", model: "qwen2.5vl:7b", + prompt: "Count 
visible buttons", + timeoutMs: 120000, }), ); expect(mocks.describeImageFile).not.toHaveBeenCalled(); @@ -552,6 +584,44 @@ describe("capability cli", () => { ); }); + it("passes describe-many prompts to each image", async () => { + await runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: [ + "capability", + "image", + "describe-many", + "--file", + "a.jpg", + "--file", + "b.jpg", + "--prompt", + "Extract all visible labels", + "--timeout-ms", + "45000", + "--json", + ], + }); + + expect(mocks.describeImageFile).toHaveBeenCalledTimes(2); + expect(mocks.describeImageFile).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ + filePath: expect.stringMatching(/a\.jpg$/), + prompt: "Extract all visible labels", + timeoutMs: 45000, + }), + ); + expect(mocks.describeImageFile).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ + filePath: expect.stringMatching(/b\.jpg$/), + prompt: "Extract all visible labels", + timeoutMs: 45000, + }), + ); + }); + it("fails image describe when no description text is returned", async () => { mocks.describeImageFile.mockResolvedValueOnce({ text: undefined, diff --git a/src/cli/capability-cli.ts b/src/cli/capability-cli.ts index ecbb033e3b4..bf028426a56 100644 --- a/src/cli/capability-cli.ts +++ b/src/cli/capability-cli.ts @@ -199,14 +199,14 @@ const CAPABILITY_METADATA: CapabilityMetadata[] = [ id: "image.describe", description: "Describe one image file through media-understanding providers.", transports: ["local"], - flags: ["--file", "--prompt", "--model", "--json"], + flags: ["--file", "--prompt", "--model", "--timeout-ms", "--json"], resultShape: "normalized text output", }, { id: "image.describe-many", description: "Describe multiple image files independently.", transports: ["local"], - flags: ["--file", "--prompt", "--model", "--json"], + flags: ["--file", "--prompt", "--model", "--timeout-ms", "--json"], resultShape: "one text output per file", }, { @@ -855,10 +855,13 @@ async 
function runImageDescribe(params: { capability: "image.describe" | "image.describe-many"; files: string[]; model?: string; + prompt?: string; + timeoutMs?: number; }) { const cfg = getRuntimeConfig(); const agentDir = resolveAgentDir(cfg, resolveDefaultAgentId(cfg)); const activeModel = requireProviderModelOverride(params.model); + const prompt = normalizeOptionalString(params.prompt); const outputs = await Promise.all( params.files.map(async (filePath) => { const resolvedPath = path.resolve(filePath); @@ -869,12 +872,15 @@ async function runImageDescribe(params: { agentDir, provider: activeModel.provider, model: activeModel.model, - prompt: "Describe the image.", + prompt: prompt ?? "Describe the image.", + timeoutMs: params.timeoutMs, }) : await describeImageFile({ filePath: resolvedPath, cfg, agentDir, + prompt, + timeoutMs: params.timeoutMs, }); if (!result.text) { throw new Error(`No description returned for image: ${resolvedPath}`); @@ -1676,7 +1682,9 @@ export function registerCapabilityCli(program: Command) { .command("describe") .description("Describe one image file") .requiredOption("--file ", "Image file") + .option("--prompt ", "Prompt hint") .option("--model ", "Model override") + .option("--timeout-ms ", "Provider request timeout in milliseconds") .option("--json", "Output JSON", false) .action(async (opts) => { await runCommandWithRuntime(defaultRuntime, async () => { @@ -1684,6 +1692,8 @@ export function registerCapabilityCli(program: Command) { capability: "image.describe", files: [String(opts.file)], model: opts.model as string | undefined, + prompt: opts.prompt as string | undefined, + timeoutMs: parseOptionalFiniteNumber(opts.timeoutMs, "--timeout-ms"), }); emitJsonOrText(defaultRuntime, Boolean(opts.json), result, formatEnvelopeForText); }); @@ -1693,7 +1703,9 @@ export function registerCapabilityCli(program: Command) { .command("describe-many") .description("Describe multiple image files") .requiredOption("--file ", "Image file", 
collectOption, []) + .option("--prompt ", "Prompt hint") .option("--model ", "Model override") + .option("--timeout-ms ", "Provider request timeout in milliseconds") .option("--json", "Output JSON", false) .action(async (opts) => { await runCommandWithRuntime(defaultRuntime, async () => { @@ -1701,6 +1713,8 @@ export function registerCapabilityCli(program: Command) { capability: "image.describe-many", files: opts.file as string[], model: opts.model as string | undefined, + prompt: opts.prompt as string | undefined, + timeoutMs: parseOptionalFiniteNumber(opts.timeoutMs, "--timeout-ms"), }); emitJsonOrText(defaultRuntime, Boolean(opts.json), result, formatEnvelopeForText); }); diff --git a/src/media-understanding/runtime-types.ts b/src/media-understanding/runtime-types.ts index d508744925b..3f22b226e4e 100644 --- a/src/media-understanding/runtime-types.ts +++ b/src/media-understanding/runtime-types.ts @@ -9,6 +9,8 @@ export type RunMediaUnderstandingFileParams = { agentDir?: string; mime?: string; activeModel?: ActiveMediaModel; + prompt?: string; + timeoutMs?: number; }; export type RunMediaUnderstandingFileResult = { @@ -24,6 +26,8 @@ export type DescribeImageFileParams = { agentDir?: string; mime?: string; activeModel?: ActiveMediaModel; + prompt?: string; + timeoutMs?: number; }; export type DescribeImageFileWithModelParams = { diff --git a/src/media-understanding/runtime.test.ts b/src/media-understanding/runtime.test.ts index 0b63f8d7041..f773d6e0011 100644 --- a/src/media-understanding/runtime.test.ts +++ b/src/media-understanding/runtime.test.ts @@ -102,6 +102,49 @@ describe("media-understanding runtime", () => { expect(mocks.cleanup).toHaveBeenCalledTimes(1); }); + it("passes per-request image prompts into media understanding config", async () => { + const output: MediaUnderstandingOutput = { + kind: "image.description", + attachmentIndex: 0, + provider: "vision-plugin", + model: "vision-v1", + text: "button count ok", + }; + 
mocks.normalizeMediaAttachments.mockReturnValue([ + { index: 0, path: "/tmp/sample.jpg", mime: "image/jpeg" }, + ]); + mocks.runCapability.mockResolvedValue({ + outputs: [output], + }); + + await describeImageFile({ + filePath: "/tmp/sample.jpg", + mime: "image/jpeg", + cfg: { + tools: { + media: { + image: { + prompt: "default image prompt", + }, + }, + }, + } as OpenClawConfig, + agentDir: "/tmp/agent", + prompt: "Count visible buttons", + timeoutMs: 90_000, + }); + + expect(mocks.runCapability).toHaveBeenCalledWith( + expect.objectContaining({ + config: expect.objectContaining({ + prompt: "Count visible buttons", + _requestPromptOverride: "Count visible buttons", + timeoutSeconds: 90, + }), + }), + ); + }); + it("surfaces the underlying provider failure when media understanding fails", async () => { mocks.normalizeMediaAttachments.mockReturnValue([ { index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" }, diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts index 3efc20844a7..65896ace322 100644 --- a/src/media-understanding/runtime.ts +++ b/src/media-understanding/runtime.ts @@ -50,12 +50,43 @@ function buildFileContext(params: { filePath: string; mime?: string }) { export async function runMediaUnderstandingFile( params: RunMediaUnderstandingFileParams, ): Promise { + const requestPrompt = params.prompt?.trim(); + const requestTimeoutSeconds = + typeof params.timeoutMs === "number" && + Number.isFinite(params.timeoutMs) && + params.timeoutMs > 0 + ? Math.ceil(params.timeoutMs / 1000) + : undefined; + const cfg = + requestPrompt || requestTimeoutSeconds !== undefined + ? { + ...params.cfg, + tools: { + ...params.cfg.tools, + media: { + ...params.cfg.tools?.media, + [params.capability]: { + ...params.cfg.tools?.media?.[params.capability], + ...(requestPrompt + ? { + prompt: requestPrompt, + _requestPromptOverride: requestPrompt, + } + : {}), + ...(requestTimeoutSeconds !== undefined + ? 
{ timeoutSeconds: requestTimeoutSeconds } + : {}), + }, + }, + }, + } + : params.cfg; const ctx = buildFileContext(params); const attachments = normalizeMediaAttachments(ctx); if (attachments.length === 0) { return { text: undefined }; } - const config = params.cfg.tools?.media?.[params.capability]; + const config = cfg.tools?.media?.[params.capability]; if (config?.enabled === false) { return { text: undefined, @@ -65,16 +96,16 @@ export async function runMediaUnderstandingFile( }; } - const providerRegistry = buildProviderRegistry(undefined, params.cfg); + const providerRegistry = buildProviderRegistry(undefined, cfg); const cache = createMediaAttachmentCache(attachments, { localPathRoots: [path.dirname(params.filePath)], - ssrfPolicy: params.cfg.tools?.web?.fetch?.ssrfPolicy, + ssrfPolicy: cfg.tools?.web?.fetch?.ssrfPolicy, }); try { const result = await runCapability({ capability: params.capability, - cfg: params.cfg, + cfg, ctx, attachments: cache, media: attachments,