From fa7de462610f6699640b9abee650ce14903fe123 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sat, 2 May 2026 07:47:15 +0100
Subject: [PATCH] fix(cli): report missing infer media providers

---
 CHANGELOG.md                             |  1 +
 src/cli/capability-cli.test.ts           | 69 ++++++++++++++++++++++++
 src/cli/capability-cli.ts                | 20 +++++++
 src/media-understanding/runtime-types.ts | 11 +++-
 src/media-understanding/runtime.test.ts  | 34 ++++++++++++
 src/media-understanding/runtime.ts       | 16 ++++--
 6 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6de79545943..bf90a540430 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Infer/media: report missing image-understanding and audio-transcription provider configuration for `image describe`, `image describe-many`, and `audio transcribe` instead of blaming the input path when no provider is available. Fixes #73569 and supersedes #73593, #74288, and #74495. Thanks @bittoby, @tmimmanuel, @Linux2010, and @vyctorbrzezowski.
 - Active Memory: use the configured recall timeout as the blocking prompt-build hook budget by default and move cold-start setup grace behind explicit `setupGraceTimeoutMs` config, so the plugin no longer silently extends 15000 ms configs to 45000 ms on the main lane. Fixes #75843. Thanks @vishutdhar.
 - Plugins/web-provider: reuse the active gateway plugin registry for runtime web provider resolution after deriving the same candidate plugin ids as the loader path, avoiding a redundant `loadOpenClawPlugins` call on every request while preserving origin and scope filters. Fixes #75513. Thanks @jochen.
 - Crestodian/CLI: exit non-zero when interactive Crestodian is invoked without a TTY, so scripts and CI no longer treat the setup error as success. Fixes #73646 and supersedes #73928 and #74059. Thanks @bittoby, @luyao618, and @Linux2010.
diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts
index 4d2d89d7649..c6f1fee96a8 100644
--- a/src/cli/capability-cli.test.ts
+++ b/src/cli/capability-cli.test.ts
@@ -782,6 +782,51 @@ describe("capability cli", () => {
     );
   });
 
+  it("reports missing image understanding configuration for image describe", async () => {
+    mocks.describeImageFile.mockResolvedValueOnce({
+      text: undefined,
+      decision: {
+        capability: "image",
+        outcome: "skipped",
+        attachments: [{ attachmentIndex: 0, attempts: [] }],
+      },
+    } as never);
+
+    await expect(
+      runRegisteredCli({
+        register: registerCapabilityCli as (program: Command) => void,
+        argv: ["capability", "image", "describe", "--file", "photo.jpg", "--json"],
+      }),
+    ).rejects.toThrow("exit 1");
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("No image understanding provider is configured or ready"),
+    );
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("agents.defaults.imageModel.primary"),
+    );
+  });
+
+  it("reports missing image understanding configuration for image describe-many", async () => {
+    mocks.describeImageFile.mockResolvedValueOnce({
+      text: undefined,
+      decision: {
+        capability: "image",
+        outcome: "skipped",
+        attachments: [{ attachmentIndex: 0, attempts: [] }],
+      },
+    } as never);
+
+    await expect(
+      runRegisteredCli({
+        register: registerCapabilityCli as (program: Command) => void,
+        argv: ["capability", "image", "describe-many", "--file", "photo.jpg", "--json"],
+      }),
+    ).rejects.toThrow("exit 1");
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("No image understanding provider is configured or ready"),
+    );
+  });
+
   it("rewrites mismatched explicit image output extensions to the detected file type", async () => {
     const jpegBase64 =
       "/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxAQEBUQEBAVFRUVFRUVFRUVFRUVFRUVFRUXFhUVFRUYHSggGBolHRUVITEhJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGi0fHyUtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLf/AABEIAAEAAQMBIgACEQEDEQH/xAAXAAEBAQEAAAAAAAAAAAAAAAAAAQID/8QAFhEBAQEAAAAAAAAAAAAAAAAAAAER/9oADAMBAAIQAxAAAAH2AP/EABgQAQEAAwAAAAAAAAAAAAAAAAEAEQIS/9oACAEBAAEFAk1o7//EABYRAQEBAAAAAAAAAAAAAAAAAAABEf/aAAgBAwEBPwGn/8QAFhEBAQEAAAAAAAAAAAAAAAAAABEB/9oACAECAQE/AYf/xAAaEAACAgMAAAAAAAAAAAAAAAABEQAhMUFh/9oACAEBAAY/AjK9cY2f/8QAGhABAQACAwAAAAAAAAAAAAAAAAERITFBUf/aAAgBAQABPyGQk7W5jVYkA//Z";
@@ -1278,6 +1323,30 @@ describe("capability cli", () => {
     );
   });
 
+  it("reports missing audio transcription configuration for audio transcribe", async () => {
+    mocks.transcribeAudioFile.mockResolvedValueOnce({
+      text: undefined,
+      decision: {
+        capability: "audio",
+        outcome: "skipped",
+        attachments: [{ attachmentIndex: 0, attempts: [] }],
+      },
+    } as never);
+
+    await expect(
+      runRegisteredCli({
+        register: registerCapabilityCli as (program: Command) => void,
+        argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"],
+      }),
+    ).rejects.toThrow("exit 1");
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("No audio transcription provider is configured or ready"),
+    );
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("tools.media.audio.models"),
+    );
+  });
+
   it("surfaces the underlying transcription failure for audio transcribe", async () => {
     mocks.transcribeAudioFile.mockRejectedValueOnce(
       new Error("Audio transcription response missing text"),
diff --git a/src/cli/capability-cli.ts b/src/cli/capability-cli.ts
index 185e0e275ed..f74e693bace 100644
--- a/src/cli/capability-cli.ts
+++ b/src/cli/capability-cli.ts
@@ -30,6 +30,7 @@ import type {
   ImageGenerationOutputFormat,
 } from "../image-generation/types.js";
 import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js";
+import type { RunMediaUnderstandingFileResult } from "../media-understanding/runtime-types.js";
 import {
   describeImageFile,
   describeImageFileWithModel,
@@ -964,6 +965,11 @@ async function runImageDescribe(params: {
             timeoutMs: params.timeoutMs,
           });
       if (!result.text) {
+        if (isMissingMediaUnderstandingProvider(result)) {
+          throw new Error(
+            "No image understanding provider is configured or ready. Configure tools.media.image.models or agents.defaults.imageModel.primary, or pass --model <provider/model> after configuring that provider's auth/API key.",
+          );
+        }
         throw new Error(`No description returned for image: ${resolvedPath}`);
       }
       return {
@@ -986,6 +992,15 @@ async function runImageDescribe(params: {
   } satisfies CapabilityEnvelope;
 }
 
+function isMissingMediaUnderstandingProvider(result: RunMediaUnderstandingFileResult): boolean {
+  const { decision } = result;
+  if (decision?.outcome !== "skipped" || decision.attachments.length === 0) {
+    return false;
+  }
+  // A skipped run with no recorded attempts on any attachment is treated as "no provider".
+  return !decision.attachments.some((attachment) => attachment.attempts.length > 0);
+}
+
 async function runAudioTranscribe(params: {
   file: string;
   language?: string;
@@ -1002,6 +1017,11 @@ async function runAudioTranscribe(params: {
     prompt: params.prompt,
   });
   if (!result.text) {
+    if (isMissingMediaUnderstandingProvider(result)) {
+      throw new Error(
+        "No audio transcription provider is configured or ready. Configure tools.media.audio.models, or pass --model <provider/model> after configuring that provider's auth/API key.",
+      );
+    }
     throw new Error(`No transcript returned for audio: ${path.resolve(params.file)}`);
   }
   return {
diff --git a/src/media-understanding/runtime-types.ts b/src/media-understanding/runtime-types.ts
index b31d062eb58..8c4e9831523 100644
--- a/src/media-understanding/runtime-types.ts
+++ b/src/media-understanding/runtime-types.ts
@@ -1,6 +1,10 @@
 import type { OpenClawConfig } from "../config/types.js";
 import type { ActiveMediaModel } from "./active-model.types.js";
-import type { MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.js";
+import type {
+  MediaUnderstandingDecision,
+  MediaUnderstandingOutput,
+  MediaUnderstandingProvider,
+} from "./types.js";
 
 export type RunMediaUnderstandingFileParams = {
   capability: "image" | "audio" | "video";
@@ -18,6 +22,7 @@ export type RunMediaUnderstandingFileResult = {
   provider?: string;
   model?: string;
   output?: MediaUnderstandingOutput;
+  decision?: MediaUnderstandingDecision;
 };
 
 export type DescribeImageFileParams = {
@@ -73,5 +78,7 @@ export type MediaUnderstandingRuntime = {
     params: DescribeImageFileWithModelParams,
   ) => Promise<DescribeImageFileWithModelResult>;
   describeVideoFile: (params: DescribeVideoFileParams) => Promise<RunMediaUnderstandingFileResult>;
-  transcribeAudioFile: (params: TranscribeAudioFileParams) => Promise<{ text: string | undefined }>;
+  transcribeAudioFile: (
+    params: TranscribeAudioFileParams,
+  ) => Promise<RunMediaUnderstandingFileResult>;
 };
diff --git a/src/media-understanding/runtime.test.ts b/src/media-understanding/runtime.test.ts
index f773d6e0011..78b1914d0a6 100644
--- a/src/media-understanding/runtime.test.ts
+++ b/src/media-understanding/runtime.test.ts
@@ -63,12 +63,46 @@ describe("media-understanding runtime", () => {
       provider: undefined,
       model: undefined,
       output: undefined,
+      decision: { capability: "image", outcome: "disabled", attachments: [] },
     });
 
     expect(mocks.buildProviderRegistry).not.toHaveBeenCalled();
     expect(mocks.runCapability).not.toHaveBeenCalled();
   });
 
+  it("preserves skipped decisions when no media provider is available", async () => {
+    const decision = {
+      capability: "audio" as const,
+      outcome: "skipped" as const,
+      attachments: [{ attachmentIndex: 0, attempts: [] }],
+    };
+    mocks.normalizeMediaAttachments.mockReturnValue([
+      { index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },
+    ]);
+    mocks.runCapability.mockResolvedValue({
+      outputs: [],
+      decision,
+    });
+
+    await expect(
+      runMediaUnderstandingFile({
+        capability: "audio",
+        filePath: "/tmp/sample.ogg",
+        mime: "audio/ogg",
+        cfg: {} as OpenClawConfig,
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toEqual({
+      text: undefined,
+      provider: undefined,
+      model: undefined,
+      output: undefined,
+      decision,
+    });
+
+    expect(mocks.cleanup).toHaveBeenCalledTimes(1);
+  });
+
   it("returns the matching capability output", async () => {
     const output: MediaUnderstandingOutput = {
       kind: "image.description",
diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts
index 65896ace322..28cacc5ee24 100644
--- a/src/media-understanding/runtime.ts
+++ b/src/media-understanding/runtime.ts
@@ -84,7 +84,10 @@ export async function runMediaUnderstandingFile(
   const ctx = buildFileContext(params);
   const attachments = normalizeMediaAttachments(ctx);
   if (attachments.length === 0) {
-    return { text: undefined };
+    return {
+      text: undefined,
+      decision: { capability: params.capability, outcome: "no-attachment", attachments: [] },
+    };
   }
   const config = cfg.tools?.media?.[params.capability];
   if (config?.enabled === false) {
@@ -93,6 +96,7 @@ export async function runMediaUnderstandingFile(
       provider: undefined,
       model: undefined,
       output: undefined,
+      decision: { capability: params.capability, outcome: "disabled", attachments: [] },
     };
   }
 
@@ -124,12 +128,16 @@ export async function runMediaUnderstandingFile(
       (entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
     );
     const text = output?.text?.trim();
-    return {
+    const fileResult: RunMediaUnderstandingFileResult = {
       text: text || undefined,
       provider: output?.provider,
       model: output?.model,
       output,
     };
+    if (result.decision) {
+      fileResult.decision = result.decision;
+    }
+    return fileResult;
   } finally {
     await cache.cleanup();
   }
@@ -171,7 +179,7 @@ export async function describeVideoFile(
 
 export async function transcribeAudioFile(
   params: TranscribeAudioFileParams,
-): Promise<{ text: string | undefined }> {
+): Promise<RunMediaUnderstandingFileResult> {
   const cfg =
     params.language || params.prompt
       ? {
@@ -192,5 +200,5 @@ export async function transcribeAudioFile(
         }
       : params.cfg;
   const result = await runMediaUnderstandingFile({ ...params, cfg, capability: "audio" });
-  return { text: result.text };
+  return result;
 }