diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e72b442a86..0d3dded7b5f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -91,6 +91,7 @@ Docs: https://docs.openclaw.ai
   keeping those envelopes out of ACP transcripts.
 - TTS/status: show configured TTS model, voice, and sanitized custom endpoint in `/status`, preserve OpenAI-compatible TTS instructions on custom endpoints, and retry empty Microsoft/Edge TTS output once. Addresses #46602, #47232, and #43936. Thanks @leekuangtao, @Huntterxx, and @rex993.
 - Agents/Gateway: steer agent-driven config edits and restarts through the owner-only `gateway` tool, document `config.schema.lookup` as the field-doc source, and warn against using `gateway stop && gateway start` as a restart substitute on macOS. Fixes #71929. Thanks @ygc3817922006-sketch.
+- Media understanding/audio: inject a deterministic transcript placeholder for too-small voice notes so agents do not hallucinate transcription or provider failures. Fixes #48944. Thanks @eulicesl.
 - Providers/vLLM: send Nemotron 3 chat-template kwargs when thinking is off and honor configured
   `params.chat_template_kwargs` for OpenAI-compatible completions, so vLLM/Nemotron replies stay
   visible instead of becoming
diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md
index f68cfa5c665..d55eb354b3a 100644
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -130,7 +130,7 @@ Recommended defaults:
 Rules:
 
 - If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
-- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription.
+- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription; inbound reply context receives a deterministic placeholder transcript so the agent knows the note was too small.
 - If the model returns more than `maxChars`, output is trimmed.
 - `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
 - If the active primary image model already supports vision natively, OpenClaw
diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index 0f2cc3b0a1f..9789a748078 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -459,8 +459,7 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
   });
 
-  it("skips URL-only audio when remote file is too small", async () => {
-    // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
+  it("injects a placeholder transcript when URL-only audio is too small", async () => {
     mockedFetchRemoteMedia.mockResolvedValueOnce({
       buffer: Buffer.alloc(100),
       contentType: "audio/ogg",
@@ -499,7 +498,66 @@
     });
 
     expect(transcribeAudio).not.toHaveBeenCalled();
-    expect(result.appliedAudio).toBe(false);
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note could not be transcribed because the audio attachment was too small]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+  });
+
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100),
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note could not be transcribed because the audio attachment was too small]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+  });
 
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
@@ -969,6 +1027,56 @@ describe("applyMediaUnderstanding", () => {
       );
     });
 
+  it("adds placeholder for tooSmall audio while preserving real transcript for valid audio", async () => {
+    const dir = await createTempMediaDir();
+    const validAudio = createSafeAudioFixtureBuffer(2048);
+    const tinyAudio = Buffer.alloc(100);
+    const validPath = path.join(dir, "valid.ogg");
+    const tinyPath = path.join(dir, "tiny.ogg");
+    await fs.writeFile(validPath, validAudio);
+    await fs.writeFile(tinyPath, tinyAudio);
+
+    const ctx: MsgContext = {
+      Body: "",
+      MediaPaths: [validPath, tinyPath],
+      MediaTypes: ["audio/ogg", "audio/ogg"],
+    };
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            attachments: { mode: "all", maxAttachments: 2 },
+            models: [{ provider: "groq" }],
provider: "groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio: async (req) => ({ text: `transcribed ${req.fileName ?? "unknown"}` }), + }, + }, + }); + + expect(result.appliedAudio).toBe(true); + expect(ctx.Transcript).toContain("transcribed valid.ogg"); + expect(ctx.Transcript).toContain( + "[Voice note could not be transcribed because the audio attachment was too small]", + ); + expect(ctx.Body).toContain("[Audio 1/2]"); + expect(ctx.Body).toContain("transcribed valid.ogg"); + expect(ctx.Body).toContain("[Audio 2/2]"); + expect(ctx.Body).toContain( + "[Voice note could not be transcribed because the audio attachment was too small]", + ); + }); + it("orders mixed media outputs as image, audio, video", async () => { const dir = await createTempMediaDir(); const imagePath = path.join(dir, "photo.jpg"); @@ -1028,6 +1136,68 @@ describe("applyMediaUnderstanding", () => { expect(ctx.BodyForCommands).toBe("audio ok"); }); + it("orders synthetic too-small audio output between image and video", async () => { + const dir = await createTempMediaDir(); + const imagePath = path.join(dir, "photo.jpg"); + const audioPath = path.join(dir, "silent.ogg"); + const videoPath = path.join(dir, "clip.mp4"); + await fs.writeFile(imagePath, "image-bytes"); + await fs.writeFile(audioPath, Buffer.alloc(100)); + await fs.writeFile(videoPath, "video-bytes"); + + const ctx: MsgContext = { + Body: "", + MediaPaths: [imagePath, audioPath, videoPath], + MediaTypes: ["image/jpeg", "audio/ogg", "video/mp4"], + }; + const cfg: OpenClawConfig = { + tools: { + media: { + image: { enabled: true, models: [{ provider: "openai", model: "gpt-5.4" }] }, + audio: { enabled: true, models: [{ provider: "groq" }] }, + video: { enabled: true, models: [{ provider: "google", model: "gemini-3" }] }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + agentDir: dir, + providers: { + openai: { + id: "openai", + describeImage: async () => ({ text: "image ok" }), + }, + groq: { + id: "groq", + transcribeAudio: async () => ({ text: "audio should not run" }), + }, + google: { + id: "google", + describeVideo: async () => ({ text: "video ok" }), + }, + }, + }); + + const placeholder = + "[Voice note could not be transcribed because the audio attachment was too small]"; + + expect(result.appliedImage).toBe(true); + expect(result.appliedAudio).toBe(true); + expect(result.appliedVideo).toBe(true); + expect(ctx.Body).toBe( + [ + "[Image]\nDescription:\nimage ok", + `[Audio]\nTranscript:\n${placeholder}`, + "[Video]\nDescription:\nvideo ok", + ].join("\n\n"), + ); + expect(ctx.Transcript).toBe(placeholder); + expect(ctx.CommandBody).toBe(placeholder); + expect(ctx.BodyForCommands).toBe(placeholder); + }); + it("treats text-like attachments as CSV (comma wins over tabs)", async () => { const csvText = '"a","b"\t"c"\n"1","2"\t"3"'; const csvPath = await createTempMediaFile({ diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 2b87dfa87b0..2c8b65f4ed2 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -48,6 +48,8 @@ export type ApplyMediaUnderstandingResult = { }; const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"]; +const EMPTY_VOICE_NOTE_PLACEHOLDER = + "[Voice note could not be transcribed because the audio attachment was too small]"; const EXTRA_TEXT_MIMES = [ "application/xml", "text/xml", @@ -306,6 +308,32 @@ function 
   return TEXT_EXT_MIME.get(ext);
 }
 
+function buildSyntheticSkippedAudioOutputs(
+  decisions: MediaUnderstandingDecision[],
+): MediaUnderstandingOutput[] {
+  const audioDecision = decisions.find((decision) => decision.capability === "audio");
+  if (!audioDecision) {
+    return [];
+  }
+  return audioDecision.attachments.flatMap((attachment) => {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"),
+    );
+    if (!hasTooSmallAttempt) {
+      return [];
+    }
+    return [
+      {
+        kind: "audio.transcription" as const,
+        attachmentIndex: attachment.attachmentIndex,
+        text: EMPTY_VOICE_NOTE_PLACEHOLDER,
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      },
+    ];
+  });
+}
+
 function isBinaryMediaMime(mime?: string): boolean {
   if (!mime) {
     return false;
@@ -527,6 +555,54 @@ export async function applyMediaUnderstanding(params: {
     decisions.push(entry.decision);
   }
 
+  const audioOutputAttachmentIndexes = new Set(
+    outputs
+      .filter((output) => output.kind === "audio.transcription")
+      .map((output) => output.attachmentIndex),
+  );
+  const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+    (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+  );
+
+  // Merge synthetic placeholders into the audio slice while preserving the
+  // selected audio attachment order from `runCapability()` / `attachments.prefer`.
+  // When audio produced no real outputs, insert the synthetic slice at the
+  // audio capability slot (before video) instead of appending at the end.
+  if (syntheticSkippedAudioOutputs.length > 0) {
+    const audioDecision = decisions.find((decision) => decision.capability === "audio");
+    const audioAttachmentOrder =
+      audioDecision?.attachments.map((attachment) => attachment.attachmentIndex) ?? [];
+    const audioOutputsByAttachmentIndex = new Map();
+    for (const output of outputs) {
+      if (output.kind === "audio.transcription") {
+        audioOutputsByAttachmentIndex.set(output.attachmentIndex, output);
+      }
+    }
+    for (const output of syntheticSkippedAudioOutputs) {
+      audioOutputsByAttachmentIndex.set(output.attachmentIndex, output);
+    }
+    const mergedAudio = audioAttachmentOrder
+      .map((attachmentIndex) => audioOutputsByAttachmentIndex.get(attachmentIndex))
+      .filter((output): output is MediaUnderstandingOutput => Boolean(output));
+
+    const firstAudioIdx = outputs.findIndex((o) => o.kind === "audio.transcription");
+    if (firstAudioIdx >= 0) {
+      const before = outputs.slice(0, firstAudioIdx);
+      const afterLastAudio = outputs.slice(
+        outputs.reduce(
+          (last, o, i) => (o.kind === "audio.transcription" ? i : last),
+          firstAudioIdx,
+        ) + 1,
+      );
+      outputs.length = 0;
+      outputs.push(...before, ...mergedAudio, ...afterLastAudio);
+    } else {
+      const firstVideoIdx = outputs.findIndex((o) => o.kind === "video.description");
+      const audioInsertIdx = firstVideoIdx >= 0 ? firstVideoIdx : outputs.length;
+      outputs.splice(audioInsertIdx, 0, ...mergedAudio);
+    }
+  }
+
   if (decisions.length > 0) {
     ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
   }
@@ -560,9 +636,19 @@ export async function applyMediaUnderstanding(params: {
     }
     ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
   }
+  // Only skip file extraction for attachments that have a real (non-synthetic)
+  // audio transcription. Synthetic placeholders should not prevent file extraction
+  // for tiny audio-MIME files that could be recovered as text via forcedTextMime.
+  const syntheticAudioIndexes = new Set(
+    syntheticSkippedAudioOutputs.map((o) => o.attachmentIndex),
+  );
   const audioAttachmentIndexes = new Set(
     outputs
-      .filter((output) => output.kind === "audio.transcription")
+      .filter(
+        (output) =>
+          output.kind === "audio.transcription" &&
+          !syntheticAudioIndexes.has(output.attachmentIndex),
+      )
       .map((output) => output.attachmentIndex),
   );
   const fileBlocks = await extractFileBlocks({
diff --git a/src/media-understanding/format.test.ts b/src/media-understanding/format.test.ts
index 172ecadf985..57ea5ebf83b 100644
--- a/src/media-understanding/format.test.ts
+++ b/src/media-understanding/format.test.ts
@@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
     });
     expect(body).toBe("[Image]\nDescription:\na cat");
   });
+
+  it("labels audio transcripts by their attachment order", () => {
+    const body = formatMediaUnderstandingBody({
+      outputs: [
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0,
+          text: "first clip was silent",
+          provider: "openclaw",
+        },
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 1,
+          text: "second clip has speech",
+          provider: "groq",
+        },
+      ],
+    });
+    expect(body).toBe(
+      [
+        "[Audio 1/2]\nTranscript:\nfirst clip was silent",
+        "[Audio 2/2]\nTranscript:\nsecond clip has speech",
+      ].join("\n\n"),
+    );
+  });
 });
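Reviewer note (outside the patch): the synthetic placeholder is distinguishable from a real transcription by its `provider`/`model` pair rather than by matching the placeholder text. A minimal TypeScript sketch of such a check, assuming only the output shape asserted in the tests above; the `isSyntheticEmptyAudio` helper is hypothetical and not part of this diff:

```ts
// Hypothetical consumer-side helper (not part of this patch): detect the
// deterministic placeholder emitted for too-small voice notes. The fields
// mirror the outputs asserted in apply.test.ts above.
interface AudioOutputLike {
  kind: string;
  provider?: string;
  model?: string;
}

function isSyntheticEmptyAudio(output: AudioOutputLike): boolean {
  return (
    output.kind === "audio.transcription" &&
    output.provider === "openclaw" &&
    output.model === "synthetic-empty-audio"
  );
}
```

Keying off `provider`/`model` keeps such a check stable even if the wording of `EMPTY_VOICE_NOTE_PLACEHOLDER` changes later.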