From 1f0431cd11fb7e6298ce5fb1cdc306746d59a0b3 Mon Sep 17 00:00:00 2001 From: Daniel Alkurdi Date: Mon, 13 Apr 2026 02:05:18 +1000 Subject: [PATCH] fix(media): surface OpenAI audio transcription failures (#65096) * fix(media): surface audio transcription provider failures * fix(media): prefer failed reasons in surfaced errors * fix(media): import attempt outcome type * fix(media): guard malformed decision arrays --------- Co-authored-by: Vincent Koc --- CHANGELOG.md | 1 + src/auto-reply/status.test.ts | 37 +++++++++++++ src/auto-reply/status.ts | 12 +++-- src/cli/capability-cli.test.ts | 16 ++++++ src/media-understanding/runner.entries.ts | 52 +++++++++++++++---- .../runner.skip-tiny-audio.test.ts | 26 ++++++++++ src/media-understanding/runner.ts | 18 ++++++- src/media-understanding/runtime.test.ts | 39 ++++++++++++++ src/media-understanding/runtime.ts | 13 +++++ src/media-understanding/types.ts | 1 + 10 files changed, 200 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f71838d0645..df4e8d2a958 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,8 @@ Docs: https://docs.openclaw.ai - Gateway/plugins: always send a non-empty `idempotencyKey` for plugin subagent runs, so dreaming narrative jobs stop failing gateway schema validation. (#65354) Thanks @CodeForgeNet and @vincentkoc. - Cron/isolated sessions: persist the right transcript path for each isolated run, including fresh session rollovers, so cron runs stop appending to stale session files. Thanks @samrusani and @vincentkoc. - Dreaming/cron: wake managed dreaming jobs immediately instead of waiting for the next heartbeat, so scheduled dreaming runs start when the cron fires. (#65053) Thanks @l0cka and @vincentkoc. - QA/packaging: stop packaged QA helpers from crashing when optional scenario execution config is unavailable, so npm distributions can skip the repo-only scenario pack without breaking completion-cache and startup paths. 
(#65118) Thanks @EdderTalmor and @vincentkoc. +- Media/audio transcription: surface the real provider failure when every audio transcription attempt fails, so status output and the CLI stop collapsing those errors into generic skips. (#65096) Thanks @l0cka and @vincentkoc. ## 2026.4.11 diff --git a/src/auto-reply/status.test.ts b/src/auto-reply/status.test.ts index 3d5238cdf6b..4a836162b7f 100644 --- a/src/auto-reply/status.test.ts +++ b/src/auto-reply/status.test.ts @@ -739,6 +739,43 @@ describe("buildStatusMessage", () => { expect(normalized).toContain("Media: image ok (openai/gpt-5.4) ยท audio skipped (maxBytes)"); }); + it("includes failed media understanding decisions with the surfaced reason", () => { + const text = buildStatusMessage({ + agent: { model: "anthropic/claude-opus-4-6" }, + sessionEntry: { sessionId: "media-failed", updatedAt: 0 }, + sessionKey: "agent:main:main", + queue: { mode: "none" }, + mediaDecisions: [ + { + capability: "audio", + outcome: "failed", + attachments: [ + { + attachmentIndex: 0, + attempts: [ + { + type: "provider", + outcome: "skipped", + reason: "empty output", + }, + { + type: "provider", + outcome: "failed", + reason: "Error: Audio transcription response missing text", + }, + ], + }, + ], + }, + ], + }); + + expect(normalizeTestText(text)).toContain( + "Media: audio failed (Audio transcription response missing text)", + ); + expect(normalizeTestText(text)).not.toContain("empty output"); + }); + it("omits media line when all decisions are none", () => { const text = buildStatusMessage({ agent: { model: "anthropic/claude-opus-4-6" }, diff --git a/src/auto-reply/status.ts b/src/auto-reply/status.ts index 5a4c800b6ee..175075216ff 100644 --- a/src/auto-reply/status.ts +++ b/src/auto-reply/status.ts @@ -26,6 +26,7 @@ import type { OpenClawConfig } from "../config/types.openclaw.js"; import { readLatestSessionUsageFromTranscript } from "../gateway/session-utils.fs.js"; import { formatTimeAgo } from 
"../infra/format-time/format-relative.ts"; import { resolveCommitHash } from "../infra/git-commit.js"; +import { findDecisionReason, summarizeDecisionReason } from "../media-understanding/runner.entries.js"; import type { MediaUnderstandingDecision } from "../media-understanding/types.js"; import { resolveAgentIdFromSessionKey } from "../routing/session-key.js"; import { @@ -375,12 +376,15 @@ const formatMediaUnderstandingLine = (decisions?: ReadonlyArray entry.attempts.map((attempt) => attempt.reason).filter(Boolean)) - .find(Boolean); - const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; + const reason = findDecisionReason(decision); + const shortReason = summarizeDecisionReason(reason); return `${decision.capability} skipped${shortReason ? ` (${shortReason})` : ""}`; } + if (decision.outcome === "failed") { + const reason = findDecisionReason(decision, "failed"); + const shortReason = summarizeDecisionReason(reason); + return `${decision.capability} failed${shortReason ? 
` (${shortReason})` : ""}`; + } return null; }) .filter((part): part is string => part != null); diff --git a/src/cli/capability-cli.test.ts b/src/cli/capability-cli.test.ts index 4add51dad14..8e786825d65 100644 --- a/src/cli/capability-cli.test.ts +++ b/src/cli/capability-cli.test.ts @@ -550,6 +550,22 @@ describe("capability cli", () => { ); }); + it("surfaces the underlying transcription failure for audio transcribe", async () => { + mocks.transcribeAudioFile.mockRejectedValueOnce( + new Error("Audio transcription response missing text"), + ); + + await expect( + runRegisteredCli({ + register: registerCapabilityCli as (program: Command) => void, + argv: ["capability", "audio", "transcribe", "--file", "memo.m4a", "--json"], + }), + ).rejects.toThrow("exit 1"); + expect(mocks.runtime.error).toHaveBeenCalledWith( + expect.stringMatching(/Audio transcription response missing text/), + ); + }); + it("forwards transcription prompt and language hints", async () => { await runRegisteredCli({ register: registerCapabilityCli as (program: Command) => void, diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts index 9e3b09d094d..0ffbbd7195a 100644 --- a/src/media-understanding/runner.entries.ts +++ b/src/media-understanding/runner.entries.ts @@ -36,6 +36,7 @@ import { extractGeminiResponse } from "./output-extract.js"; import { getMediaUnderstandingProvider, normalizeMediaProviderId } from "./provider-registry.js"; import { resolveMaxBytes, resolveMaxChars, resolvePrompt, resolveTimeoutMs } from "./resolve.js"; import type { + MediaUnderstandingAttemptOutcome, MediaUnderstandingCapability, MediaUnderstandingDecision, MediaUnderstandingModelDecision, @@ -444,21 +445,54 @@ export function formatDecisionSummary(decision: MediaUnderstandingDecision): str const provider = typeof chosen?.provider === "string" ? chosen.provider.trim() : undefined; const model = typeof chosen?.model === "string" ? 
chosen.model.trim() : undefined; const modelLabel = provider ? (model ? `${provider}/${model}` : provider) : undefined; - const reason = attachments - .flatMap((entry) => { - const attempts = Array.isArray(entry?.attempts) ? entry.attempts : []; - return attempts - .map((attempt) => (typeof attempt?.reason === "string" ? attempt.reason : undefined)) - .filter((value): value is string => Boolean(value)); - }) - .find((value) => value.trim().length > 0); - const shortReason = reason ? reason.split(":")[0]?.trim() : undefined; + const reason = findDecisionReason( + decision, + decision.outcome === "failed" ? "failed" : undefined, + ); + const shortReason = summarizeDecisionReason(reason); const countLabel = total > 0 ? ` (${success}/${total})` : ""; const viaLabel = modelLabel ? ` via ${modelLabel}` : ""; const reasonLabel = shortReason ? ` reason=${shortReason}` : ""; return `${decision.capability}: ${decision.outcome}${countLabel}${viaLabel}${reasonLabel}`; } +export function findDecisionReason( + decision: MediaUnderstandingDecision, + outcome?: MediaUnderstandingAttemptOutcome, +): string | undefined { + const attachments = Array.isArray(decision.attachments) ? decision.attachments : []; + for (const attachment of attachments) { + const attempts = Array.isArray(attachment?.attempts) ? attachment.attempts : []; + for (const attempt of attempts) { + if (outcome && attempt.outcome !== outcome) { + continue; + } + if (typeof attempt.reason !== "string" || attempt.reason.trim().length === 0) { + continue; + } + return attempt.reason; + } + } + return undefined; +} + +export function normalizeDecisionReason(reason?: string): string | undefined { + const trimmed = typeof reason === "string" ? 
reason.trim() : ""; + if (!trimmed) { + return undefined; + } + const normalized = trimmed.replace(/^Error:\s*/i, "").trim(); + return normalized || undefined; +} + +export function summarizeDecisionReason(reason?: string): string | undefined { + const normalized = normalizeDecisionReason(reason); + if (!normalized) { + return undefined; + } + return normalized.split(":")[0]?.trim() || undefined; +} + function assertMinAudioSize(params: { size: number; attachmentIndex: number }): void { if (params.size >= MIN_AUDIO_FILE_BYTES) { return; diff --git a/src/media-understanding/runner.skip-tiny-audio.test.ts b/src/media-understanding/runner.skip-tiny-audio.test.ts index 59a2558355a..e9cf0b53e1a 100644 --- a/src/media-understanding/runner.skip-tiny-audio.test.ts +++ b/src/media-understanding/runner.skip-tiny-audio.test.ts @@ -182,4 +182,30 @@ describe("runCapability skips tiny audio files", () => { }, }); }); + + it("marks the decision as failed when every audio model attempt fails", async () => { + await withAudioFixture({ + filePrefix: "openclaw-failed-audio", + extension: "ogg", + mediaType: "audio/ogg", + fileContents: Buffer.alloc(MIN_AUDIO_FILE_BYTES + 100), + run: async ({ ctx, media, cache }) => { + const result = await runAudioCapabilityWithTranscriber({ + ctx, + media, + cache, + transcribeAudio: async () => { + throw new Error("upstream 500"); + }, + }); + + expect(result.outputs).toHaveLength(0); + expect(result.decision.outcome).toBe("failed"); + expect(result.decision.attachments).toHaveLength(1); + expect(result.decision.attachments[0]?.attempts).toHaveLength(1); + expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("failed"); + expect(result.decision.attachments[0]?.attempts[0]?.reason).toContain("upstream 500"); + }, + }); + }); }); diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index fcbdf8912cb..b420e59a3ce 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -20,6 
+20,7 @@ import type { MediaUnderstandingModelConfig, } from "../config/types.tools.js"; import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { logWarn } from "../logger.js"; import { resolveChannelInboundAttachmentRoots } from "../media/channel-inbound-roots.js"; import { mergeInboundPathRoots } from "../media/inbound-path-policy.js"; import { getDefaultMediaLocalRoots } from "../media/local-roots.js"; @@ -725,6 +726,12 @@ async function runAttachmentEntries(params: { return { output: null, attempts }; } +function hasFailedMediaAttempt(attachments: MediaUnderstandingDecision["attachments"]): boolean { + return attachments.some((attachment) => + attachment.attempts.some((attempt) => attempt.outcome === "failed"), + ); +} + export async function runCapability(params: { capability: MediaUnderstandingCapability; cfg: OpenClawConfig; @@ -861,10 +868,17 @@ export async function runCapability(params: { } const decision: MediaUnderstandingDecision = { capability, - outcome: outputs.length > 0 ? "success" : "skipped", + outcome: + outputs.length > 0 + ? "success" + : hasFailedMediaAttempt(attachmentDecisions) + ? 
"failed" + : "skipped", attachments: attachmentDecisions, }; - if (shouldLogVerbose()) { + if (decision.outcome === "failed") { + logWarn(`media-understanding: ${formatDecisionSummary(decision)}`); + } else if (shouldLogVerbose()) { logVerbose(`Media understanding ${formatDecisionSummary(decision)}`); } return { diff --git a/src/media-understanding/runtime.test.ts b/src/media-understanding/runtime.test.ts index de13bb1e24a..0b63f8d7041 100644 --- a/src/media-understanding/runtime.test.ts +++ b/src/media-understanding/runtime.test.ts @@ -101,4 +101,43 @@ describe("media-understanding runtime", () => { expect(mocks.runCapability).toHaveBeenCalledTimes(1); expect(mocks.cleanup).toHaveBeenCalledTimes(1); }); + + it("surfaces the underlying provider failure when media understanding fails", async () => { + mocks.normalizeMediaAttachments.mockReturnValue([ + { index: 0, path: "/tmp/sample.ogg", mime: "audio/ogg" }, + ]); + mocks.runCapability.mockResolvedValue({ + outputs: [], + decision: { + capability: "audio", + outcome: "failed", + attachments: [ + { + attachmentIndex: 0, + attempts: [ + { + type: "provider", + provider: "openai", + model: "gpt-4o-mini-transcribe", + outcome: "failed", + reason: "Error: Audio transcription response missing text", + }, + ], + }, + ], + }, + }); + + await expect( + runMediaUnderstandingFile({ + capability: "audio", + filePath: "/tmp/sample.ogg", + mime: "audio/ogg", + cfg: {} as OpenClawConfig, + agentDir: "/tmp/agent", + }), + ).rejects.toThrow("Audio transcription response missing text"); + + expect(mocks.cleanup).toHaveBeenCalledTimes(1); + }); }); diff --git a/src/media-understanding/runtime.ts b/src/media-understanding/runtime.ts index cba5844c08e..20fea18f925 100644 --- a/src/media-understanding/runtime.ts +++ b/src/media-understanding/runtime.ts @@ -1,6 +1,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import { normalizeMediaProviderId } from "./provider-registry.js"; +import { findDecisionReason, 
normalizeDecisionReason } from "./runner.entries.js"; import { buildProviderRegistry, createMediaAttachmentCache, @@ -33,6 +34,12 @@ const KIND_BY_CAPABILITY: Record>["decision"], +): string | undefined { + return normalizeDecisionReason(findDecisionReason(decision, "failed")); +} + function buildFileContext(params: { filePath: string; mime?: string }) { return { MediaPath: params.filePath, @@ -75,6 +82,12 @@ export async function runMediaUnderstandingFile( config, activeModel: params.activeModel, }); + if (result.outputs.length === 0 && result.decision.outcome === "failed") { + throw new Error( + resolveDecisionFailureReason(result.decision) ?? + `${params.capability} understanding failed`, + ); + } const output = result.outputs.find( (entry) => entry.kind === KIND_BY_CAPABILITY[params.capability], ); diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 052b8569b08..69b7a35b5c5 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -32,6 +32,7 @@ export type MediaUnderstandingOutput = { export type MediaUnderstandingDecisionOutcome = | "success" + | "failed" | "skipped" | "disabled" | "no-attachment"