fix(cli): report missing infer media providers

2026-05-06 05:10:44 +00:00 · 2026-05-02 07:47:15 +01:00
parent 798515809c
commit fa7de46261
6 changed files with 145 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,7 @@ Docs: https://docs.openclaw.ai

 ### Fixes

+- Infer/media: report missing image-understanding and audio-transcription provider configuration for `image describe`, `image describe-many`, and `audio transcribe` instead of blaming the input path when no provider is available. Fixes #73569 and supersedes #73593, #74288, and #74495. Thanks @bittoby, @tmimmanuel, @Linux2010, and @vyctorbrzezowski.
 - Active Memory: use the configured recall timeout as the blocking prompt-build hook budget by default and move cold-start setup grace behind explicit `setupGraceTimeoutMs` config, so the plugin no longer silently extends 15000 ms configs to 45000 ms on the main lane. Fixes #75843. Thanks @vishutdhar.
 - Plugins/web-provider: reuse the active gateway plugin registry for runtime web provider resolution after deriving the same candidate plugin ids as the loader path, avoiding a redundant `loadOpenClawPlugins` call on every request while preserving origin and scope filters. Fixes #75513. Thanks @jochen.
 - Crestodian/CLI: exit non-zero when interactive Crestodian is invoked without a TTY, so scripts and CI no longer treat the setup error as success. Fixes #73646 and supersedes #73928 and #74059. Thanks @bittoby, @luyao618, and @Linux2010.
--- a/src/cli/capability-cli.test.ts
+++ b/src/cli/capability-cli.test.ts
@@ -782,6 +782,51 @@ describe("capability cli", () => {
    );
  });

+  it("reports missing image understanding configuration for image describe", async () => {
+    mocks.describeImageFile.mockResolvedValueOnce({
+      text: undefined,
+      decision: {
+        capability: "image",
+        outcome: "skipped",
+        attachments: [{ attachmentIndex: 0, attempts: [] }],
+      },
+    } as never);
+
+    await expect(
+      runRegisteredCli({
+        register: registerCapabilityCli as (program: Command) => void,
+        argv: ["capability", "image", "describe", "--file", "photo.jpg", "--json"],
+      }),
+    ).rejects.toThrow("exit 1");
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("No image understanding provider is configured or ready"),
+    );
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("agents.defaults.imageModel.primary"),
+    );
+  });
+
+  it("reports missing image understanding configuration for image describe-many", async () => {
+    mocks.describeImageFile.mockResolvedValueOnce({
+      text: undefined,
+      decision: {
+        capability: "image",
+        outcome: "skipped",
+        attachments: [{ attachmentIndex: 0, attempts: [] }],
+      },
+    } as never);
+
+    await expect(
+      runRegisteredCli({
+        register: registerCapabilityCli as (program: Command) => void,
+        argv: ["capability", "image", "describe-many", "--file", "photo.jpg", "--json"],
+      }),
+    ).rejects.toThrow("exit 1");
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("No image understanding provider is configured or ready"),
+    );
+  });
+
  it("rewrites mismatched explicit image output extensions to the detected file type", async () => {
    const jpegBase64 =
      "/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxAQEBUQEBAVFRUVFRUVFRUVFRUVFRUVFRUXFhUVFRUYHSggGBolHRUVITEhJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGi0fHyUtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLf/AABEIAAEAAQMBIgACEQEDEQH/xAAXAAEBAQEAAAAAAAAAAAAAAAAAAQID/8QAFhEBAQEAAAAAAAAAAAAAAAAAAAER/9oADAMBAAIQAxAAAAH2AP/EABgQAQEAAwAAAAAAAAAAAAAAAAEAEQIS/9oACAEBAAEFAk1o7//EABYRAQEBAAAAAAAAAAAAAAAAAAABEf/aAAgBAwEBPwGn/8QAFhEBAQEAAAAAAAAAAAAAAAAAABEB/9oACAECAQE/AYf/xAAaEAACAgMAAAAAAAAAAAAAAAABEQAhMUFh/9oACAEBAAY/AjK9cY2f/8QAGhABAQACAwAAAAAAAAAAAAAAAAERITFBUf/aAAgBAQABPyGQk7W5jVYkA//Z";
@@ -1278,6 +1323,30 @@ describe("capability cli", () => {
    );
  });

+  it("reports missing audio transcription configuration for audio transcribe", async () => {
+    mocks.transcribeAudioFile.mockResolvedValueOnce({
+      text: undefined,
+      decision: {
+        capability: "audio",
+        outcome: "skipped",
+        attachments: [{ attachmentIndex: 0, attempts: [] }],
+      },
+    } as never);
+
+    await expect(
+      runRegisteredCli({
+        register: registerCapabilityCli as (program: Command) => void,
+        argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"],
+      }),
+    ).rejects.toThrow("exit 1");
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("No audio transcription provider is configured or ready"),
+    );
+    expect(mocks.runtime.error).toHaveBeenCalledWith(
+      expect.stringContaining("tools.media.audio.models"),
+    );
+  });
+
  it("surfaces the underlying transcription failure for audio transcribe", async () => {
    mocks.transcribeAudioFile.mockRejectedValueOnce(
      new Error("Audio transcription response missing text"),
--- a/src/cli/capability-cli.ts
+++ b/src/cli/capability-cli.ts
@@ -30,6 +30,7 @@ import type {
  ImageGenerationOutputFormat,
 } from "../image-generation/types.js";
 import { buildMediaUnderstandingRegistry } from "../media-understanding/provider-registry.js";
+import type { RunMediaUnderstandingFileResult } from "../media-understanding/runtime-types.js";
 import {
  describeImageFile,
  describeImageFileWithModel,
@@ -964,6 +965,11 @@ async function runImageDescribe(params: {
            timeoutMs: params.timeoutMs,
          });
      if (!result.text) {
+        if (isMissingMediaUnderstandingProvider(result)) {
+          throw new Error(
+            "No image understanding provider is configured or ready. Configure tools.media.image.models or agents.defaults.imageModel.primary, or pass --model <provider/model> after configuring that provider's auth/API key.",
+          );
+        }
        throw new Error(`No description returned for image: ${resolvedPath}`);
      }
      return {
@@ -986,6 +992,15 @@ async function runImageDescribe(params: {
  } satisfies CapabilityEnvelope;
 }

+function isMissingMediaUnderstandingProvider(result: RunMediaUnderstandingFileResult): boolean {
+  const decision = result.decision;
+  return (
+    decision?.outcome === "skipped" &&
+    decision.attachments.length > 0 &&
+    decision.attachments.every((attachment) => attachment.attempts.length === 0)
+  );
+}
+
 async function runAudioTranscribe(params: {
  file: string;
  language?: string;
@@ -1002,6 +1017,11 @@ async function runAudioTranscribe(params: {
    prompt: params.prompt,
  });
  if (!result.text) {
+    if (isMissingMediaUnderstandingProvider(result)) {
+      throw new Error(
+        "No audio transcription provider is configured or ready. Configure tools.media.audio.models, or pass --model <provider/model> after configuring that provider's auth/API key.",
+      );
+    }
    throw new Error(`No transcript returned for audio: ${path.resolve(params.file)}`);
  }
  return {
--- a/src/media-understanding/runtime-types.ts
+++ b/src/media-understanding/runtime-types.ts
@@ -1,6 +1,10 @@
 import type { OpenClawConfig } from "../config/types.js";
 import type { ActiveMediaModel } from "./active-model.types.js";
-import type { MediaUnderstandingOutput, MediaUnderstandingProvider } from "./types.js";
+import type {
+  MediaUnderstandingDecision,
+  MediaUnderstandingOutput,
+  MediaUnderstandingProvider,
+} from "./types.js";

 export type RunMediaUnderstandingFileParams = {
  capability: "image" | "audio" | "video";
@@ -18,6 +22,7 @@ export type RunMediaUnderstandingFileResult = {
  provider?: string;
  model?: string;
  output?: MediaUnderstandingOutput;
+  decision?: MediaUnderstandingDecision;
 };

 export type DescribeImageFileParams = {
@@ -73,5 +78,7 @@ export type MediaUnderstandingRuntime = {
    params: DescribeImageFileWithModelParams,
  ) => Promise<DescribeImageFileWithModelResult>;
  describeVideoFile: (params: DescribeVideoFileParams) => Promise<RunMediaUnderstandingFileResult>;
-  transcribeAudioFile: (params: TranscribeAudioFileParams) => Promise<{ text: string | undefined }>;
+  transcribeAudioFile: (
+    params: TranscribeAudioFileParams,
+  ) => Promise<RunMediaUnderstandingFileResult>;
 };
--- a/src/media-understanding/runtime.test.ts
+++ b/src/media-understanding/runtime.test.ts
@@ -63,12 +63,46 @@ describe("media-understanding runtime", () => {
      provider: undefined,
      model: undefined,
      output: undefined,
+      decision: { capability: "image", outcome: "disabled", attachments: [] },
    });

    expect(mocks.buildProviderRegistry).not.toHaveBeenCalled();
    expect(mocks.runCapability).not.toHaveBeenCalled();
  });

+  it("preserves skipped decisions when no media provider is available", async () => {
+    const decision = {
+      capability: "audio" as const,
+      outcome: "skipped" as const,
+      attachments: [{ attachmentIndex: 0, attempts: [] }],
+    };
+    mocks.normalizeMediaAttachments.mockReturnValue([
+      { index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" },
+    ]);
+    mocks.runCapability.mockResolvedValue({
+      outputs: [],
+      decision,
+    });
+
+    await expect(
+      runMediaUnderstandingFile({
+        capability: "audio",
+        filePath: "/tmp/sample.ogg",
+        mime: "audio/ogg",
+        cfg: {} as OpenClawConfig,
+        agentDir: "/tmp/agent",
+      }),
+    ).resolves.toEqual({
+      text: undefined,
+      provider: undefined,
+      model: undefined,
+      output: undefined,
+      decision,
+    });
+
+    expect(mocks.cleanup).toHaveBeenCalledTimes(1);
+  });
+
  it("returns the matching capability output", async () => {
    const output: MediaUnderstandingOutput = {
      kind: "image.description",
--- a/src/media-understanding/runtime.ts
+++ b/src/media-understanding/runtime.ts
@@ -84,7 +84,10 @@ export async function runMediaUnderstandingFile(
  const ctx = buildFileContext(params);
  const attachments = normalizeMediaAttachments(ctx);
  if (attachments.length === 0) {
-    return { text: undefined };
+    return {
+      text: undefined,
+      decision: { capability: params.capability, outcome: "no-attachment", attachments: [] },
+    };
  }
  const config = cfg.tools?.media?.[params.capability];
  if (config?.enabled === false) {
@@ -93,6 +96,7 @@ export async function runMediaUnderstandingFile(
      provider: undefined,
      model: undefined,
      output: undefined,
+      decision: { capability: params.capability, outcome: "disabled", attachments: [] },
    };
  }

@@ -124,12 +128,16 @@ export async function runMediaUnderstandingFile(
      (entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
    );
    const text = output?.text?.trim();
-    return {
+    const fileResult: RunMediaUnderstandingFileResult = {
      text: text || undefined,
      provider: output?.provider,
      model: output?.model,
      output,
    };
+    if (result.decision) {
+      fileResult.decision = result.decision;
+    }
+    return fileResult;
  } finally {
    await cache.cleanup();
  }
@@ -171,7 +179,7 @@ export async function describeVideoFile(

 export async function transcribeAudioFile(
  params: TranscribeAudioFileParams,
-): Promise<{ text: string | undefined }> {
+): Promise<RunMediaUnderstandingFileResult> {
  const cfg =
    params.language || params.prompt
      ? {
@@ -192,5 +200,5 @@ export async function transcribeAudioFile(
        }
      : params.cfg;
  const result = await runMediaUnderstandingFile({ ...params, cfg, capability: "audio" });
-  return { text: result.text };
+  return result;
 }