fix(cli): support image files in model probes

2026-05-06 09:50:42 +00:00 · 2026-04-28 18:50:41 +01:00
parent 3f780bb27d
commit 53d34e7cde
5 changed files with 239 additions and 20 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,12 +29,11 @@ Docs: https://docs.openclaw.ai
 - Memory-core/dreaming: retry managed dreaming cron registration after startup when the cron service is not reachable yet, so the scheduled Memory Dreaming Promotion sweep recovers without waiting for heartbeat traffic. Fixes #72841. Thanks @amknight.
 - Acpx/runtime: validate the runtime session mode at the `AcpxRuntime.ensureSession` wrapper boundary so callers that pass anything other than `persistent` or `oneshot` get a clear `ACP_INVALID_RUNTIME_OPTION` error instead of silently round-tripping through the encoded handle as a default `persistent` mode and later throwing `SessionResumeRequiredError`. Investigation context: #73071. (#73548) Thanks @amknight.
 - CLI/infer: keep web-search fallback on missing provider API keys, preserve structured validation errors from the selected provider, and let per-request image describe prompts override configured media-entry prompts. (#63263) Thanks @Spolen23.
+- CLI/model probes: add repeatable image `--file` inputs to `infer model run` for local and gateway multimodal model smokes, so vision models such as Ollama Qwen VL and Gemini can be tested through the raw model-probe surface. Fixes #63700. Thanks @cedricjanssens.
+- CLI/image describe: pass `--prompt` and `--timeout-ms` through `infer image describe` and `describe-many`, so custom vision instructions and slow local model budgets reach media-understanding providers such as Ollama, OpenAI, Google, and OpenRouter. Refs #63700. Thanks @cedricjanssens.
 - WhatsApp/Web: pass explicit Baileys socket timings into every WhatsApp Web socket and expose `web.whatsapp.*` keepalive, connect, and query timeout settings so unstable networks can avoid repeated 408 disconnect and opening-handshake timeout loops. Fixes #56365. (#73580) Thanks @velvet-shark.
 - Channels/Telegram: persist native command metadata on target sessions so topic, helper, and ACP-bound slash commands keep their session metadata attached to the routed conversation. (#57548) Thanks @GaosCode.
 - Channels/native commands: keep validated native slash command replies visible in group chats while preserving explicit owner allowlists for command authorization. (#73672) Thanks @obviyus.
-
-### Fixes
-
 - Auto-reply/session: carry the tail of user/assistant turns into the freshly-rotated transcript on silent in-reply session resets (compaction failure, role-ordering conflict) so direct-chat continuity survives the rebind. Fixes #70853. (#70898) Thanks @neeravmakwana.

 ## 2026.4.27
--- a/docs/cli/infer.md
+++ b/docs/cli/infer.md
@@ -107,18 +107,19 @@ and the shared capability runtime before the provider request is made.

 This table maps common inference tasks to the corresponding infer command.

-| Task                    | Command                                                                  | Notes                                                 |
-| ----------------------- | ------------------------------------------------------------------------ | ----------------------------------------------------- |
-| Run a text/model prompt | `openclaw infer model run --prompt "..." --json`                         | Uses the normal local path by default                 |
-| Generate an image       | `openclaw infer image generate --prompt "..." --json`                    | Use `image edit` when starting from an existing file  |
-| Describe an image file  | `openclaw infer image describe --file ./image.png --prompt "..." --json` | `--model` must be an image-capable `<provider/model>` |
-| Transcribe audio        | `openclaw infer audio transcribe --file ./memo.m4a --json`               | `--model` must be `<provider/model>`                  |
-| Synthesize speech       | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json`   | `tts status` is gateway-oriented                      |
-| Generate a video        | `openclaw infer video generate --prompt "..." --json`                    | Supports provider hints such as `--resolution`        |
-| Describe a video file   | `openclaw infer video describe --file ./clip.mp4 --json`                 | `--model` must be `<provider/model>`                  |
-| Search the web          | `openclaw infer web search --query "..." --json`                         |                                                       |
-| Fetch a web page        | `openclaw infer web fetch --url https://example.com --json`              |                                                       |
-| Create embeddings       | `openclaw infer embedding create --text "..." --json`                    |                                                       |
+| Task                         | Command                                                                                       | Notes                                                 |
+| ---------------------------- | --------------------------------------------------------------------------------------------- | ----------------------------------------------------- |
+| Run a text/model prompt      | `openclaw infer model run --prompt "..." --json`                                              | Uses the normal local path by default                 |
+| Run a model prompt on images | `openclaw infer model run --prompt "Describe this" --file ./image.png --model provider/model` | Repeat `--file` for multiple image inputs             |
+| Generate an image            | `openclaw infer image generate --prompt "..." --json`                                         | Use `image edit` when starting from an existing file  |
+| Describe an image file       | `openclaw infer image describe --file ./image.png --prompt "..." --json`                      | `--model` must be an image-capable `<provider/model>` |
+| Transcribe audio             | `openclaw infer audio transcribe --file ./memo.m4a --json`                                    | `--model` must be `<provider/model>`                  |
+| Synthesize speech            | `openclaw infer tts convert --text "..." --output ./speech.mp3 --json`                        | `tts status` is gateway-oriented                      |
+| Generate a video             | `openclaw infer video generate --prompt "..." --json`                                         | Supports provider hints such as `--resolution`        |
+| Describe a video file        | `openclaw infer video describe --file ./clip.mp4 --json`                                      | `--model` must be `<provider/model>`                  |
+| Search the web               | `openclaw infer web search --query "..." --json`                                              |                                                       |
+| Fetch a web page             | `openclaw infer web fetch --url https://example.com --json`                                   |                                                       |
+| Create embeddings            | `openclaw infer embedding create --text "..." --json`                                         |                                                       |

 ## Behavior

@@ -131,7 +132,9 @@ This table maps common inference tasks to the corresponding infer command.
 - Gateway-managed state commands default to gateway.
 - The normal local path does not require the gateway to be running.
 - Local `model run` is a lean one-shot provider completion. It resolves the configured agent model and auth, but does not start a chat-agent turn, load tools, or open bundled MCP servers.
- `model run --gateway` exercises Gateway routing, saved auth, provider selection, and the embedded runtime, but still runs as a raw model probe: it sends the supplied prompt without prior session transcript, bootstrap/AGENTS context, context-engine assembly, tools, or bundled MCP servers.
+- `model run --file` accepts image files, detects their MIME type, and sends them with the supplied prompt to the selected model. Repeat `--file` for multiple images.
+- `model run --file` rejects non-image inputs. Use `infer audio transcribe` for audio files and `infer video describe` for video files.
+- `model run --gateway` exercises Gateway routing, saved auth, provider selection, and the embedded runtime, but still runs as a raw model probe: it sends the supplied prompt and any image attachments without prior session transcript, bootstrap/AGENTS context, context-engine assembly, tools, or bundled MCP servers.

 ## Model

@@ -139,7 +142,8 @@ Use `model` for provider-backed text inference and model/provider inspection.

 ```bash
 openclaw infer model run --prompt "Reply with exactly: smoke-ok" --json
-openclaw infer model run --prompt "Summarize this changelog entry" --provider openai --json
+openclaw infer model run --prompt "Summarize this changelog entry" --model openai/gpt-5.4 --json
+openclaw infer model run --prompt "Describe this image in one sentence" --file ./photo.jpg --model google/gemini-2.5-flash --json
 openclaw infer model providers --json
 openclaw infer model inspect --name gpt-5.5 --json
 ```
@@ -154,11 +158,15 @@ openclaw infer model run --local --model google/gemini-2.5-flash --prompt "Reply
 openclaw infer model run --local --model groq/llama-3.1-8b-instant --prompt "Reply with exactly: pong" --json
 openclaw infer model run --local --model mistral/mistral-small-latest --prompt "Reply with exactly: pong" --json
 openclaw infer model run --local --model openai/gpt-4.1 --prompt "Reply with exactly: pong" --json
+openclaw infer model run --local --model ollama/qwen2.5vl:7b --prompt "Describe this image." --file ./photo.jpg --json
 ```

 Notes:

 - Local `model run` is the narrowest CLI smoke for provider/model/auth health because it sends only the supplied prompt to the selected model.
+- Local `model run --file` keeps that lean path and attaches image content directly to the single user message. Common image files such as PNG, JPEG, and WebP work when their MIME type is detected as `image/*`; unsupported or unrecognized files fail before the provider is called.
+- `model run --file` is best when you want to test the selected multimodal text model directly. Use `infer image describe` when you want OpenClaw's image-understanding provider selection and default image-model routing.
+- The selected model must support image input; text-only models may reject the request at the provider layer.
 - `model run --prompt` must contain non-whitespace text; empty prompts are rejected before local providers or the Gateway are called.
 - Local `model run` exits non-zero when the provider returns no text output, so unreachable local providers and empty completions do not look like successful probes.
 - Use `model run --gateway` when you need to test Gateway routing, agent-runtime setup, or Gateway-managed provider state while keeping the model input raw. Use `openclaw agent` or chat surfaces when you want the full agent context, tools, memory, and session transcript.
--- a/docs/providers/ollama.md
+++ b/docs/providers/ollama.md
@@ -215,6 +215,25 @@ transport, but it does not start a chat-agent turn or load MCP/tool context. If
 this succeeds while normal agent replies fail, troubleshoot the model's agent
 prompt/tool capacity next.

+For a narrow vision-model smoke test on the same lean path, add one or more
+image files to `infer model run`. This sends the prompt and image directly to
+the selected Ollama vision model without loading chat tools, memory, or prior
+session context:
+
+```bash
+OLLAMA_API_KEY=ollama-local \
+  openclaw infer model run \
+    --local \
+    --model ollama/qwen2.5vl:7b \
+    --prompt "Describe this image in one sentence." \
+    --file ./photo.jpg \
+    --json
+```
+
+`model run --file` accepts files detected as `image/*`, including common PNG,
+JPEG, and WebP inputs. Non-image files are rejected before Ollama is called.
+For speech recognition, use `openclaw infer audio transcribe` instead.
+
 When you switch a conversation with `/model ollama/<model>`, OpenClaw treats
 that as an exact user selection. If the configured Ollama `baseUrl` is
 unreachable, the next reply fails with the provider error instead of silently
@@ -269,6 +288,8 @@ openclaw infer image describe \

 `--model` must be a full `<provider/model>` ref. When it is set, `openclaw infer image describe` runs that model directly instead of skipping description because the model supports native vision.

+Use `infer image describe` when you want OpenClaw's image-understanding provider flow, configured `agents.defaults.imageModel`, and image-description output shape. Use `infer model run --file` when you want a raw multimodal model probe with a custom prompt and one or more images.
+
 To make Ollama the default image-understanding model for inbound media, configure `agents.defaults.imageModel`:

 ```json5
--- a/src/cli/capability-cli.test.ts
+++ b/src/cli/capability-cli.test.ts
@@ -6,6 +6,9 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 import { runRegisteredCli } from "../test-utils/command-runner.js";
 import { registerCapabilityCli } from "./capability-cli.js";

+const PNG_1X1_BASE64 =
+  "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+yf7kAAAAASUVORK5CYII=";
+
 const mocks = vi.hoisted(() => ({
  runtime: {
    log: vi.fn(),
@@ -419,6 +422,117 @@ describe("capability cli", () => {
    );
  });

+  it("passes image files to local model probes", async () => {
+    const tempInput = path.join(os.tmpdir(), `openclaw-model-run-image-${Date.now()}.png`);
+    await fs.writeFile(tempInput, Buffer.from(PNG_1X1_BASE64, "base64"));
+
+    await runRegisteredCli({
+      register: registerCapabilityCli as (program: Command) => void,
+      argv: [
+        "capability",
+        "model",
+        "run",
+        "--prompt",
+        "describe this",
+        "--file",
+        tempInput,
+        "--json",
+      ],
+    });
+
+    expect(mocks.completeWithPreparedSimpleCompletionModel).toHaveBeenCalledWith(
+      expect.objectContaining({
+        context: {
+          messages: [
+            expect.objectContaining({
+              role: "user",
+              content: [
+                { type: "text", text: "describe this" },
+                { type: "image", data: PNG_1X1_BASE64, mimeType: "image/png" },
+              ],
+            }),
+          ],
+        },
+      }),
+    );
+    expect(mocks.runtime.writeJson).toHaveBeenCalledWith(
+      expect.objectContaining({
+        inputs: [
+          expect.objectContaining({
+            path: tempInput,
+            mimeType: "image/png",
+          }),
+        ],
+      }),
+    );
+  });
+
+  it("passes image files to gateway model probes as attachments", async () => {
+    const tempInput = path.join(os.tmpdir(), `openclaw-model-run-gateway-image-${Date.now()}.png`);
+    await fs.writeFile(tempInput, Buffer.from(PNG_1X1_BASE64, "base64"));
+
+    await runRegisteredCli({
+      register: registerCapabilityCli as (program: Command) => void,
+      argv: [
+        "capability",
+        "model",
+        "run",
+        "--prompt",
+        "describe this",
+        "--file",
+        tempInput,
+        "--gateway",
+        "--json",
+      ],
+    });
+
+    expect(mocks.callGateway).toHaveBeenCalledWith(
+      expect.objectContaining({
+        method: "agent",
+        params: expect.objectContaining({
+          message: "describe this",
+          attachments: [
+            {
+              type: "image",
+              fileName: path.basename(tempInput),
+              mimeType: "image/png",
+              content: PNG_1X1_BASE64,
+            },
+          ],
+          modelRun: true,
+          promptMode: "none",
+        }),
+      }),
+    );
+  });
+
+  it("rejects non-image files for model probes", async () => {
+    const tempInput = path.join(os.tmpdir(), `openclaw-model-run-audio-${Date.now()}.mp3`);
+    await fs.writeFile(tempInput, Buffer.from("not really audio"));
+
+    await expect(
+      runRegisteredCli({
+        register: registerCapabilityCli as (program: Command) => void,
+        argv: [
+          "capability",
+          "model",
+          "run",
+          "--prompt",
+          "transcribe this",
+          "--file",
+          tempInput,
+          "--json",
+        ],
+      }),
+    ).rejects.toThrow("exit 1");
+
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("Only image files are supported"),
+    );
+    expect(mocks.completeWithPreparedSimpleCompletionModel).not.toHaveBeenCalled();
+    expect(mocks.callGateway).not.toHaveBeenCalled();
+  });
+
  it("fails local model probes when the provider returns no text output", async () => {
    mocks.completeWithPreparedSimpleCompletionModel.mockResolvedValueOnce({
      content: [],
--- a/src/cli/capability-cli.ts
+++ b/src/cli/capability-cli.ts
@@ -104,6 +104,7 @@ type CapabilityEnvelope = {
  provider?: string;
  model?: string;
  attempts: Array<Record<string, unknown>>;
+  inputs?: Array<Record<string, unknown>>;
  outputs: Array<Record<string, unknown>>;
  ignoredOverrides?: Array<Record<string, unknown>>;
  error?: string;
@@ -112,9 +113,9 @@ type CapabilityEnvelope = {
 const CAPABILITY_METADATA: CapabilityMetadata[] = [
  {
    id: "model.run",
-    description: "Run a one-shot text inference turn through the selected model provider.",
+    description: "Run a one-shot inference turn through the selected model provider.",
    transports: ["local", "gateway"],
-    flags: ["--prompt", "--model", "--local", "--gateway", "--json"],
+    flags: ["--prompt", "--file", "--model", "--local", "--gateway", "--json"],
    resultShape: "normalized payloads plus provider/model attribution",
  },
  {
@@ -584,13 +585,62 @@ function requireModelRunPrompt(value: unknown): string {
  return value;
 }

+type ModelRunImageFile = {
+  path: string;
+  fileName: string;
+  mimeType: string;
+  data: string;
+};
+
+async function readModelRunImageFiles(files: string[] | undefined): Promise<ModelRunImageFile[]> {
+  if (!files || files.length === 0) {
+    return [];
+  }
+  return await Promise.all(
+    files.map(async (filePath) => {
+      const resolvedPath = path.resolve(filePath);
+      const buffer = await fs.readFile(resolvedPath);
+      const mimeType = normalizeMimeType(
+        await detectMime({
+          buffer,
+          filePath: resolvedPath,
+        }),
+      );
+      if (!mimeType?.startsWith("image/")) {
+        throw new Error(
+          `Unsupported --file for model run: ${resolvedPath}. Only image files are supported; use infer audio transcribe for audio files.`,
+        );
+      }
+      return {
+        path: resolvedPath,
+        fileName: path.basename(resolvedPath),
+        mimeType,
+        data: buffer.toString("base64"),
+      };
+    }),
+  );
+}
+
 async function runModelRun(params: {
  prompt: string;
+  files?: string[];
  model?: string;
  transport: CapabilityTransport;
 }) {
  const cfg = getRuntimeConfig();
  const agentId = resolveDefaultAgentId(cfg);
+  const imageFiles = await readModelRunImageFiles(params.files);
+  const messageContent =
+    imageFiles.length > 0
+      ? [
+          { type: "text" as const, text: params.prompt },
+          ...imageFiles.map((image) => ({
+            type: "image" as const,
+            data: image.data,
+            mimeType: image.mimeType,
+          })),
+        ]
+      : params.prompt;
  if (params.transport === "local") {
    const prepared = await prepareSimpleCompletionModelForAgent({
      cfg,
@@ -609,7 +659,7 @@ async function runModelRun(params: {
        messages: [
          {
            role: "user",
-            content: params.prompt,
+            content: messageContent,
            timestamp: Date.now(),
          },
        ],
@@ -634,6 +684,14 @@ async function runModelRun(params: {
      provider: prepared.selection.provider,
      model: prepared.selection.modelId,
      attempts: [],
+      ...(imageFiles.length > 0
+        ? {
+            inputs: imageFiles.map((image) => ({
+              path: image.path,
+              mimeType: image.mimeType,
+            })),
+          }
+        : {}),
      outputs: [
        {
          text,
@@ -654,6 +712,15 @@ async function runModelRun(params: {
    params: {
      agentId,
      message: params.prompt,
+      attachments:
+        imageFiles.length > 0
+          ? imageFiles.map((image) => ({
+              type: "image",
+              fileName: image.fileName,
+              mimeType: image.mimeType,
+              content: image.data,
+            }))
+          : undefined,
      provider,
      model,
      modelRun: true,
@@ -678,6 +745,14 @@ async function runModelRun(params: {
      mediaUrl: payload.mediaUrl,
      mediaUrls: payload.mediaUrls,
    })),
+    ...(imageFiles.length > 0
+      ? {
+          inputs: imageFiles.map((image) => ({
+            path: image.path,
+            mimeType: image.mimeType,
+          })),
+        }
+      : {}),
  } satisfies CapabilityEnvelope;
 }

@@ -1494,6 +1569,7 @@ export function registerCapabilityCli(program: Command) {
    .command("run")
    .description("Run a one-shot model turn")
    .requiredOption("--prompt <text>", "Prompt text")
+    .option("--file <path>", "Image file", collectOption, [])
    .option("--model <provider/model>", "Model override")
    .option("--local", "Force local execution", false)
    .option("--gateway", "Force gateway execution", false)
@@ -1509,6 +1585,7 @@ export function registerCapabilityCli(program: Command) {
        });
        const result = await runModelRun({
          prompt,
+          files: opts.file as string[] | undefined,
          model: opts.model as string | undefined,
          transport,
        });