From 1f0431cd11fb7e6298ce5fb1cdc306746d59a0b3 Mon Sep 17 00:00:00 2001 From: Daniel Alkurdi Date: Mon, 13 Apr 2026 02:05:18 +1000 Subject: [PATCH] fix(media): surface OpenAI audio transcription failures (#65096) * fix(media): surface audio transcription provider failures * fix(media): prefer failed reasons in surfaced errors * fix(media): import attempt outcome type * fix(media): guard malformed decision arrays --------- Co-authored-by: Vincent Koc --- CHANGELOG.md | 1 + src/auto-reply/status.test.ts | 37 +++++++++++++ src/auto-reply/status.ts | 12 +++-- src/cli/capability-cli.test.ts | 16 ++++++ src/media-understanding/runner.entries.ts | 52 +++++++++++++++---- .../runner.skip-tiny-audio.test.ts | 26 ++++++++++ src/media-understanding/runner.ts | 18 ++++++- src/media-understanding/runtime.test.ts | 39 ++++++++++++++ src/media-understanding/runtime.ts | 13 +++++ src/media-understanding/types.ts | 1 + 10 files changed, 200 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f71838d0645..df4e8d2a958 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,8 @@ Docs: https://docs.openclaw.ai - Gateway/plugins: always send a non-empty `idempotencyKey` for plugin subagent runs, so dreaming narrative jobs stop failing gateway schema validation. (#65354) Thanks @CodeForgeNet and @vincentkoc. - Cron/isolated sessions: persist the right transcript path for each isolated run, including fresh session rollovers, so cron runs stop appending to stale session files. Thanks @samrusani and @vincentkoc. - Dreaming/cron: wake managed dreaming jobs immediately instead of waiting for the next heartbeat, so scheduled dreaming runs start when the cron fires. (#65053) Thanks @l0cka and @vincentkoc. - QA/packaging: stop packaged QA helpers from crashing when optional scenario execution config is unavailable, so npm distributions can skip the repo-only scenario pack without breaking completion-cache and startup paths. 
(#65118) Thanks @EdderTalmor and @vincentkoc. +- Media/audio transcription: surface the real provider failure when every audio transcription attempt fails, so status output and the CLI stop collapsing those errors into generic skips. (#65096) Thanks @l0cka and @vincentkoc. ## 2026.4.11 diff --git a/src/auto-reply/status.test.ts b/src/auto-reply/status.test.ts index 3d5238cdf6b..4a836162b7f 100644 --- a/src/auto-reply/status.test.ts +++ b/src/auto-reply/status.test.ts @@ -739,6 +739,43 @@ describe("buildStatusMessage", () => { expect(normalized).toContain("Media: image ok (openai/gpt-5.4) ยท audio skipped (maxBytes)"); }); + it("includes failed media understanding decisions with the surfaced reason", () => { + const text = buildStatusMessage({ + agent: { model: "anthropic/claude-opus-4-6" }, + sessionEntry: { sessionId: "media-failed", updatedAt: 0 }, + sessionKey: "agent:main:main", + queue: { mode: "none" }, + mediaDecisions: [ + { + capability: "audio", + outcome: "failed", + attachments: [ + { + attachmentIndex: 0, + attempts: [ + { + type: "provider", + outcome: "skipped", + reason: "empty output", + }, + { + type: "provider", + outcome: "failed", + reason: "Error: Audio transcription response missing text", + }, + ], + }, + ], + }, + ], + }); + + expect(normalizeTestText(text)).toContain( + "Media: audio failed (Audio transcription response missing text)", + ); + expect(normalizeTestText(text)).not.toContain("empty output"); + }); + it("omits media line when all decisions are none", () => { const text = buildStatusMessage({ agent: { model: "anthropic/claude-opus-4-6" }, diff --git a/src/auto-reply/status.ts b/src/auto-reply/status.ts index 5a4c800b6ee..175075216ff 100644 --- a/src/auto-reply/status.ts +++ b/src/auto-reply/status.ts @@ -26,6 +26,7 @@ import type { OpenClawConfig } from "../config/types.openclaw.js"; import { readLatestSessionUsageFromTranscript } from "../gateway/session-utils.fs.js"; import { formatTimeAgo } from 
"../infra/format-time/format-relative.ts"; import { resolveCommitHash } from "../infra/git-commit.js"; +import { findDecisionReason, summarizeDecisionReason } from "../media-understanding/runner.entries.js"; import type { MediaUnderstandingDecision } from "../media-understanding/types.js"; import { resolveAgentIdFromSessionKey } from "../routing/session-key.js"; import { @@ -375,12 +376,15 @@ const formatMediaUnderstandingLine = (decisions?: ReadonlyArray entry.attempts.map((attempt) => attempt.reason).filter(Boolean)) - .find(Boolean); - const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; + const reason = findDecisionReason(decision); + const shortReason = summarizeDecisionReason(reason); return `${decision.capability} skipped${shortReason ? ` (${shortReason})` : ""}`; } + if (decision.outcome === "failed") { + const reason = findDecisionReason(decision, "failed"); + const shortReason = summarizeDecisionReason(reason); + return `${decision.capability} failed${shortReason ? 
` (${shortReason})` : ""}`; + } return null; }) .filter((part): part is string => part != null); diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts index 4add51dad14..8e786825d65 100644 --- a/src/cli/capability-cli.test.ts +++ b/src/cli/capability-cli.test.ts @@ -550,6 +550,22 @@ describe("capability cli", () => { ); }); + it("surfaces the underlying transcription failure for audio transcribe", async () => { + mocks.transcribeAudioFile.mockRejectedValueOnce( + new Error("Audio transcription response missing text"), + ); + + await expect( + runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"], + }), + ).rejects.toThrow("exit 1"); + expect(mocks.runtime.error).toHaveBeenCalledWith( + expect.stringMatching(/Audio transcription response missing text/), + ); + }); + it("forwards transcription prompt and language hints", async () => { await runRegisteredCli({ register: registerCapabilityCli as (program: Command) => void, diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts index 9e3b09d094d..0ffbbd7195a 100644 --- a/src/media-understanding/runner.entries.ts +++ b/src/media-understanding/runner.entries.ts @@ -36,6 +36,7 @@ import { extractGeminiResponse } from "./output-extract.js"; import { getMediaUnderstandingProvider, normalizeMediaProviderId } from "./provider-registry.js"; import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js"; import type { + MediaUnderstandingAttemptOutcome, MediaUnderstandingCapability, MediaUnderstandingDecision, MediaUnderstandingModelDecision, @@ -444,21 +445,54 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined; const model = typeof chosen?.model === "string" ? 
chosen.model.trim() : undefined; const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined; - const reason = attachments - .flatMap((entry) => { - const attempts = Array.isArray(entry?.attempts) ? entry.attempts : []; - return attempts - .map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined)) - .filter((value): value is string => Boolean(value)); - }) - .find((value) => value.trim().length > 0); - const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; + const reason = findDecisionReason( + decision, + decision.outcome === "failed" ? "failed" : undefined, + ); + const shortReason = summarizeDecisionReason(reason); const countLabel = total > 0 ? ` (${success}/${total})` : ""; const viaLabel = modelLabel ? ` via ${modelLabel}` : ""; const reasonLabel = shortReason ? ` reason=${shortReason}` : ""; return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`; } +export function findDecisionReason( + decision: MediaUnderstandingDecision, + outcome?: MediaUnderstandingAttemptOutcome, +): string | undefined { + const attachments = Array.isArray(decision.attachments) ? decision.attachments : []; + for (const attachment of attachments) { + const attempts = Array.isArray(attachment?.attempts) ? attachment.attempts : []; + for (const attempt of attempts) { + if (outcome && attempt.outcome !== outcome) { + continue; + } + if (typeof attempt.reason !== "string" || attempt.reason.trim().length === 0) { + continue; + } + return attempt.reason; + } + } + return undefined; +} + +export function normalizeDecisionReason(reason?: string): string | undefined { + const trimmed = typeof reason === "string" ? 
reason.trim() : ""; + if (!trimmed) { + return undefined; + } + const normalized = trimmed.replace(/^Error:\s*/i, "").trim(); + return normalized || undefined; +} + +export function summarizeDecisionReason(reason?: string): string | undefined { + const normalized = normalizeDecisionReason(reason); + if (!normalized) { + return undefined; + } + return normalized.split(":")[0]?.trim() || undefined; +} + function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void { if (params.size >= MIN_AUDIO_FILE_BYTES) { return; diff --git a/src/media-understanding/runner.skip-tiny-audio.test.ts b/src/media-understanding/runner.skip-tiny-audio.test.ts index 59a2558355a..e9cf0b53e1a 100644 --- a/src/media-understanding/runner.skip-tiny-audio.test.ts +++ b/src/media-understanding/runner.skip-tiny-audio.test.ts @@ -182,4 +182,30 @@ describe("runCapability skips tiny audio files", () => { }, }); }); + + it("marks the decision as failed when every audio model attempt fails", async () => { + await withAudioFixture({ + filePrefix: "openclaw-failed-audio", + extension: "ogg", + mediaType: "audio/ogg", + fileContents: Buffer.alloc(MIN_AUDIO_FILE_BYTES + 100), + run: async ({ ctx, media, cache }) => { + const result = await runAudioCapabilityWithTranscriber({ + ctx, + media, + cache, + transcribeAudio: async () => { + throw new Error("upstream 500"); + }, + }); + + expect(result.outputs).toHaveLength(0); + expect(result.decision.outcome).toBe("failed"); + expect(result.decision.attachments).toHaveLength(1); + expect(result.decision.attachments[0]?.attempts).toHaveLength(1); + expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("failed"); + expect(result.decision.attachments[0]?.attempts[0]?.reason).toContain("upstream 500"); + }, + }); + }); }); diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index fcbdf8912cb..b420e59a3ce 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -20,6 
+20,7 @@ import type { MediaUnderstandingModelConfig, } from "../config/types.tools.js"; import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { logWarn } from "../logger.js"; import { resolveChannelInboundAttachmentRoots } from "../media/channel-inbound-roots.js"; import { mergeInboundPathRoots } from "../media/inbound-path-policy.js"; import { getDefaultMediaLocalRoots } from "../media/local-roots.js"; @@ -725,6 +726,12 @@ async function runAttachmentEntries(params: { return { output: null, attempts }; } +function hasFailedMediaAttempt(attachments: MediaUnderstandingDecision["attachments"]): boolean { + return attachments.some((attachment) => + attachment.attempts.some((attempt) => attempt.outcome === "failed"), + ); +} + export async function runCapability(params: { capability: MediaUnderstandingCapability; cfg: OpenClawConfig; @@ -861,10 +868,17 @@ export async function runCapability(params: { } const decision: MediaUnderstandingDecision = { capability, - outcome: outputs.length > 0 ? "success" : "skipped", + outcome: + outputs.length > 0 + ? "success" + : hasFailedMediaAttempt(attachmentDecisions) + ? 
"failed" + : "skipped", attachments: attachmentDecisions, }; - if (shouldLogVerbose()) { + if (decision.outcome === "failed") { + logWarn(`media-understanding: ${formatDecisionSummary(decision)}`); + } else if (shouldLogVerbose()) { logVerbose(`Media understanding ${formatDecisionSummary(decision)}`); } return { diff --git a/src/media-understanding/runtime.test.ts b/src/media-understanding/runtime.test.ts index de13bb1e24a..0b63f8d7041 100644 --- a/src/media-understanding/runtime.test.ts +++ b/src/media-understanding/runtime.test.ts @@ -101,4 +101,43 @@ describe("media-understanding runtime", () => { expect(mocks.runCapability).toHaveBeenCalledTimes(1); expect(mocks.cleanup).toHaveBeenCalledTimes(1); }); + + it("surfaces the underlying provider failure when media understanding fails", async () => { + mocks.normalizeMediaAttachments.mockReturnValue([ + { index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" }, + ]); + mocks.runCapability.mockResolvedValue({ + outputs: [], + decision: { + capability: "audio", + outcome: "failed", + attachments: [ + { + attachmentIndex: 0, + attempts: [ + { + type: "provider", + provider: "openai", + model: "gpt-4o-mini-transcribe", + outcome: "failed", + reason: "Error: Audio transcription response missing text", + }, + ], + }, + ], + }, + }); + + await expect( + runMediaUnderstandingFile({ + capability: "audio", + filePath: "/tmp/sample.ogg", + mime: "audio/ogg", + cfg: {} as OpenClawConfig, + agentDir: "/tmp/agent", + }), + ).rejects.toThrow("Audio transcription response missing text"); + + expect(mocks.cleanup).toHaveBeenCalledTimes(1); + }); }); diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts index cba5844c08e..20fea18f925 100644 --- a/src/media-understanding/runtime.ts +++ b/src/media-understanding/runtime.ts @@ -1,6 +1,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import { normalizeMediaProviderId } from "./provider-registry.js"; +import { findDecisionReason, 
normalizeDecisionReason } from "./runner.entries.js"; import { buildProviderRegistry, createMediaAttachmentCache, @@ -33,6 +34,12 @@ const KIND_BY_CAPABILITY: Record>["decision"], +): string | undefined { + return normalizeDecisionReason(findDecisionReason(decision, "failed")); +} + function buildFileContext(params: { filePath: string; mime?: string }) { return { MediaPath: params.filePath, @@ -75,6 +82,12 @@ export async function runMediaUnderstandingFile( config, activeModel: params.activeModel, }); + if (result.outputs.length === 0 && result.decision.outcome === "failed") { + throw new Error( + resolveDecisionFailureReason(result.decision) ?? + `${params.capability} understanding failed`, + ); + } const output = result.outputs.find( (entry) => entry.kind === KIND_BY_CAPABILITY[params.capability], ); diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 052b8569b08..69b7a35b5c5 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -32,6 +32,7 @@ export type MediaUnderstandingOutput = { export type MediaUnderstandingDecisionOutcome = | "success" + | "failed" | "skipped" | "disabled" | "no-attachment"