fix: add placeholder transcript for silent voice notes (#49131)

* fix: add placeholder transcript for silent voice notes

* fix: handle placeholder transcripts per skipped attachment

* fix: preserve synthetic transcript attachment order

* fix: scope synthetic audio merge to the audio slice only, preserving cross-capability and attachments.prefer ordering

Replace the global outputs.sort() with a targeted merge that:
1. Only sorts within the audio output slice (real + synthetic),
   preserving CAPABILITY_ORDER and per-capability attachments.prefer
   ordering for non-audio outputs.
2. Excludes synthetic placeholder indexes from audioAttachmentIndexes
   used by extractFileBlocks, so tiny audio-MIME files with text
   extensions can still be recovered via forcedTextMime.

Adds mergeAudioOutputsPreservingAttachmentOrder helper.
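
A minimal TypeScript sketch of point 2; `realAudioIndexes` is an illustrative helper name for what the diff below does inline when building `audioAttachmentIndexes` for `extractFileBlocks`:

    type AudioOutput = { kind: string; attachmentIndex: number };

    // Collect only attachments that got a real transcription; synthetic
    // placeholder indexes are dropped so extractFileBlocks can still attempt
    // forcedTextMime recovery on tiny audio-MIME files with text extensions.
    function realAudioIndexes(outputs: AudioOutput[], synthetic: AudioOutput[]): Set<number> {
      const syntheticIndexes = new Set(synthetic.map((o) => o.attachmentIndex));
      return new Set(
        outputs
          .filter(
            (o) => o.kind === "audio.transcription" && !syntheticIndexes.has(o.attachmentIndex),
          )
          .map((o) => o.attachmentIndex),
      );
    }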

* fix: remove unused function and use toSorted() for oxlint compliance

* fix(media-understanding): preserve selected audio order for synthetic placeholders

- merge synthetic skipped-audio placeholders using audio decision order
  instead of raw attachmentIndex sorting, preserving attachments.prefer
- insert synthetic-only audio outputs at the audio capability slot
  (before video) when no real audio outputs were produced
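
A minimal TypeScript sketch of this merge, mirroring the inline logic in applyMediaUnderstanding further down; the `mergeSyntheticAudio` name and the simplified `MediaOutput` type are illustrative, not part of the change:

    type MediaOutput = {
      kind: "image.description" | "audio.transcription" | "video.description";
      attachmentIndex: number;
      text: string;
    };

    // `synthetic` is assumed to already exclude attachments that produced a
    // real transcript, matching the filter in the diff below.
    function mergeSyntheticAudio(
      outputs: MediaOutput[],
      synthetic: MediaOutput[],
      audioAttachmentOrder: number[],
    ): MediaOutput[] {
      if (synthetic.length === 0) return outputs;
      const byAttachment = new Map<number, MediaOutput>();
      for (const o of outputs) {
        if (o.kind === "audio.transcription") byAttachment.set(o.attachmentIndex, o);
      }
      for (const o of synthetic) byAttachment.set(o.attachmentIndex, o);
      // Re-emit audio in the order the audio decision selected the attachments
      // (attachments.prefer), not raw attachmentIndex order.
      const mergedAudio = audioAttachmentOrder
        .map((i) => byAttachment.get(i))
        .filter((o): o is MediaOutput => Boolean(o));
      const firstAudio = outputs.findIndex((o) => o.kind === "audio.transcription");
      if (firstAudio >= 0) {
        // Replace the existing audio slice in place; image/video ordering is untouched.
        const lastAudio = outputs.reduce(
          (last, o, i) => (o.kind === "audio.transcription" ? i : last),
          firstAudio,
        );
        return [...outputs.slice(0, firstAudio), ...mergedAudio, ...outputs.slice(lastAudio + 1)];
      }
      // No real audio output: insert the synthetic slice at the audio capability
      // slot, i.e. before the first video description, else at the end.
      const firstVideo = outputs.findIndex((o) => o.kind === "video.description");
      const insertAt = firstVideo >= 0 ? firstVideo : outputs.length;
      return [...outputs.slice(0, insertAt), ...mergedAudio, ...outputs.slice(insertAt)];
    }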

* fix(media-understanding): use neutral too-small placeholder text

Clarify that this synthetic transcript path is triggered by attachment size,
not by a silence/no-speech detection result.
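
For context, a small sketch of the trigger, assuming the 1024-byte threshold documented below; `MIN_AUDIO_FILE_BYTES` and the placeholder text appear in the diff, while this helper itself is illustrative:

    const MIN_AUDIO_FILE_BYTES = 1024;
    const EMPTY_VOICE_NOTE_PLACEHOLDER =
      "[Voice note could not be transcribed because the audio attachment was too small]";

    // The placeholder is driven purely by attachment size; no silence or
    // voice-activity detection runs on the decoded audio.
    function placeholderTranscriptFor(audio: Buffer): string | undefined {
      return audio.length < MIN_AUDIO_FILE_BYTES ? EMPTY_VOICE_NOTE_PLACEHOLDER : undefined;
    }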

* test(media-understanding): update too-small audio placeholder expectations

* test(media-understanding): cover mixed too-small audio placeholder

* test(media-understanding): cover too-small audio context

* fix(tasks): preserve visible task title before internal context

* Revert "fix(tasks): preserve visible task title before internal context"

This reverts commit dc536fb4d3c8a01168de5d05e8562193dd68a88e.

---------

Co-authored-by: Eulices Lopez <eulices@users.noreply.github.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
Author: Eulices
Date: 2026-04-26 00:14:01 -04:00
Committed by: GitHub
Parent: bcc9fc4cf5
Commit: 008e4ca81f
5 changed files with 287 additions and 5 deletions


@@ -91,6 +91,7 @@ Docs: https://docs.openclaw.ai
keeping those envelopes out of ACP transcripts.
- TTS/status: show configured TTS model, voice, and sanitized custom endpoint in `/status`, preserve OpenAI-compatible TTS instructions on custom endpoints, and retry empty Microsoft/Edge TTS output once. Addresses #46602, #47232, and #43936. Thanks @leekuangtao, @Huntterxx, and @rex993.
- Agents/Gateway: steer agent-driven config edits and restarts through the owner-only `gateway` tool, document `config.schema.lookup` as the field-doc source, and warn against using `gateway stop && gateway start` as a restart substitute on macOS. Fixes #71929. Thanks @ygc3817922006-sketch.
- Media understanding/audio: inject a deterministic transcript placeholder for too-small voice notes so agents do not hallucinate transcription or provider failures. Fixes #48944. Thanks @eulicesl.
- Providers/vLLM: send Nemotron 3 chat-template kwargs when thinking is off
and honor configured `params.chat_template_kwargs` for OpenAI-compatible
completions, so vLLM/Nemotron replies stay visible instead of becoming


@@ -130,7 +130,7 @@ Recommended defaults:
Rules:
- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
-- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription.
+- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription; inbound reply context receives a deterministic placeholder transcript so the agent knows the note was too small.
- If the model returns more than `maxChars`, output is trimmed.
- `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
- If the active primary image model already supports vision natively, OpenClaw


@@ -459,8 +459,7 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
});
it("skips URL-only audio when remote file is too small", async () => {
// Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
it("injects a placeholder transcript when URL-only audio is too small", async () => {
mockedFetchRemoteMedia.mockResolvedValueOnce({
buffer: Buffer.alloc(100),
contentType: "audio/ogg",
@@ -499,7 +498,66 @@ describe("applyMediaUnderstanding", () => {
});
expect(transcribeAudio).not.toHaveBeenCalled();
-expect(result.appliedAudio).toBe(false);
+expect(result.appliedAudio).toBe(true);
expect(result.outputs).toEqual([
expect.objectContaining({
kind: "audio.transcription",
text: "[Voice note could not be transcribed because the audio attachment was too small]",
provider: "openclaw",
model: "synthetic-empty-audio",
}),
]);
expect(ctx.Transcript).toBe(
"[Voice note could not be transcribed because the audio attachment was too small]",
);
expect(ctx.Body).toBe(
"[Audio]\nTranscript:\n[Voice note could not be transcribed because the audio attachment was too small]",
);
});
it("injects a placeholder transcript when local-path audio is too small", async () => {
const ctx = await createAudioCtx({
fileName: "tiny.ogg",
mediaType: "audio/ogg",
content: Buffer.alloc(100),
});
const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: { id: "groq", transcribeAudio },
},
});
expect(transcribeAudio).not.toHaveBeenCalled();
expect(result.appliedAudio).toBe(true);
expect(result.outputs).toEqual([
expect.objectContaining({
kind: "audio.transcription",
text: "[Voice note could not be transcribed because the audio attachment was too small]",
provider: "openclaw",
model: "synthetic-empty-audio",
}),
]);
expect(ctx.Transcript).toBe(
"[Voice note could not be transcribed because the audio attachment was too small]",
);
expect(ctx.Body).toBe(
"[Audio]\nTranscript:\n[Voice note could not be transcribed because the audio attachment was too small]",
);
});
it("skips audio transcription when attachment exceeds maxBytes", async () => {
@@ -969,6 +1027,56 @@ describe("applyMediaUnderstanding", () => {
);
});
it("adds placeholder for tooSmall audio while preserving real transcript for valid audio", async () => {
const dir = await createTempMediaDir();
const validAudio = createSafeAudioFixtureBuffer(2048);
const tinyAudio = Buffer.alloc(100);
const validPath = path.join(dir, "valid.ogg");
const tinyPath = path.join(dir, "tiny.ogg");
await fs.writeFile(validPath, validAudio);
await fs.writeFile(tinyPath, tinyAudio);
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPaths: [validPath, tinyPath],
MediaTypes: ["audio/ogg", "audio/ogg"],
};
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
attachments: { mode: "all", maxAttachments: 2 },
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async (req) => ({ text: `transcribed ${req.fileName ?? "unknown"}` }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toContain("transcribed valid.ogg");
expect(ctx.Transcript).toContain(
"[Voice note could not be transcribed because the audio attachment was too small]",
);
expect(ctx.Body).toContain("[Audio 1/2]");
expect(ctx.Body).toContain("transcribed valid.ogg");
expect(ctx.Body).toContain("[Audio 2/2]");
expect(ctx.Body).toContain(
"[Voice note could not be transcribed because the audio attachment was too small]",
);
});
it("orders mixed media outputs as image, audio, video", async () => {
const dir = await createTempMediaDir();
const imagePath = path.join(dir, "photo.jpg");
@@ -1028,6 +1136,68 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.BodyForCommands).toBe("audio ok");
});
it("orders synthetic too-small audio output between image and video", async () => {
const dir = await createTempMediaDir();
const imagePath = path.join(dir, "photo.jpg");
const audioPath = path.join(dir, "silent.ogg");
const videoPath = path.join(dir, "clip.mp4");
await fs.writeFile(imagePath, "image-bytes");
await fs.writeFile(audioPath, Buffer.alloc(100));
await fs.writeFile(videoPath, "video-bytes");
const ctx: MsgContext = {
Body: "<media:mixed>",
MediaPaths: [imagePath, audioPath, videoPath],
MediaTypes: ["image/jpeg", "audio/ogg", "video/mp4"],
};
const cfg: OpenClawConfig = {
tools: {
media: {
image: { enabled: true, models: [{ provider: "openai", model: "gpt-5.4" }] },
audio: { enabled: true, models: [{ provider: "groq" }] },
video: { enabled: true, models: [{ provider: "google", model: "gemini-3" }] },
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
agentDir: dir,
providers: {
openai: {
id: "openai",
describeImage: async () => ({ text: "image ok" }),
},
groq: {
id: "groq",
transcribeAudio: async () => ({ text: "audio should not run" }),
},
google: {
id: "google",
describeVideo: async () => ({ text: "video ok" }),
},
},
});
const placeholder =
"[Voice note could not be transcribed because the audio attachment was too small]";
expect(result.appliedImage).toBe(true);
expect(result.appliedAudio).toBe(true);
expect(result.appliedVideo).toBe(true);
expect(ctx.Body).toBe(
[
"[Image]\nDescription:\nimage ok",
`[Audio]\nTranscript:\n${placeholder}`,
"[Video]\nDescription:\nvideo ok",
].join("\n\n"),
);
expect(ctx.Transcript).toBe(placeholder);
expect(ctx.CommandBody).toBe(placeholder);
expect(ctx.BodyForCommands).toBe(placeholder);
});
it("treats text-like attachments as CSV (comma wins over tabs)", async () => {
const csvText = '"a","b"\t"c"\n"1","2"\t"3"';
const csvPath = await createTempMediaFile({


@@ -48,6 +48,8 @@ export type ApplyMediaUnderstandingResult = {
};
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
const EMPTY_VOICE_NOTE_PLACEHOLDER =
"[Voice note could not be transcribed because the audio attachment was too small]";
const EXTRA_TEXT_MIMES = [
"application/xml",
"text/xml",
@@ -306,6 +308,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
return TEXT_EXT_MIME.get(ext);
}
function buildSyntheticSkippedAudioOutputs(
decisions: MediaUnderstandingDecision[],
): MediaUnderstandingOutput[] {
const audioDecision = decisions.find((decision) => decision.capability === "audio");
if (!audioDecision) {
return [];
}
return audioDecision.attachments.flatMap((attachment) => {
const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
attempt.reason?.trim().startsWith("tooSmall"),
);
if (!hasTooSmallAttempt) {
return [];
}
return [
{
kind: "audio.transcription" as const,
attachmentIndex: attachment.attachmentIndex,
text: EMPTY_VOICE_NOTE_PLACEHOLDER,
provider: "openclaw",
model: "synthetic-empty-audio",
},
];
});
}
function isBinaryMediaMime(mime?: string): boolean {
if (!mime) {
return false;
@@ -527,6 +555,54 @@ export async function applyMediaUnderstanding(params: {
decisions.push(entry.decision);
}
const audioOutputAttachmentIndexes = new Set(
outputs
.filter((output) => output.kind === "audio.transcription")
.map((output) => output.attachmentIndex),
);
const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
(output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
);
// Merge synthetic placeholders into the audio slice while preserving the
// selected audio attachment order from `runCapability()` / `attachments.prefer`.
// When audio produced no real outputs, insert the synthetic slice at the
// audio capability slot (before video) instead of appending at the end.
if (syntheticSkippedAudioOutputs.length > 0) {
const audioDecision = decisions.find((decision) => decision.capability === "audio");
const audioAttachmentOrder =
audioDecision?.attachments.map((attachment) => attachment.attachmentIndex) ?? [];
const audioOutputsByAttachmentIndex = new Map<number, MediaUnderstandingOutput>();
for (const output of outputs) {
if (output.kind === "audio.transcription") {
audioOutputsByAttachmentIndex.set(output.attachmentIndex, output);
}
}
for (const output of syntheticSkippedAudioOutputs) {
audioOutputsByAttachmentIndex.set(output.attachmentIndex, output);
}
const mergedAudio = audioAttachmentOrder
.map((attachmentIndex) => audioOutputsByAttachmentIndex.get(attachmentIndex))
.filter((output): output is MediaUnderstandingOutput => Boolean(output));
const firstAudioIdx = outputs.findIndex((o) => o.kind === "audio.transcription");
if (firstAudioIdx >= 0) {
const before = outputs.slice(0, firstAudioIdx);
const afterLastAudio = outputs.slice(
outputs.reduce(
(last, o, i) => (o.kind === "audio.transcription" ? i : last),
firstAudioIdx,
) + 1,
);
outputs.length = 0;
outputs.push(...before, ...mergedAudio, ...afterLastAudio);
} else {
const firstVideoIdx = outputs.findIndex((o) => o.kind === "video.description");
const audioInsertIdx = firstVideoIdx >= 0 ? firstVideoIdx : outputs.length;
outputs.splice(audioInsertIdx, 0, ...mergedAudio);
}
}
if (decisions.length > 0) {
ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
}
@@ -560,9 +636,19 @@ export async function applyMediaUnderstanding(params: {
}
ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
}
// Only skip file extraction for attachments that have a real (non-synthetic)
// audio transcription. Synthetic placeholders should not prevent file extraction
// for tiny audio-MIME files that could be recovered as text via forcedTextMime.
const syntheticAudioIndexes = new Set(
syntheticSkippedAudioOutputs.map((o) => o.attachmentIndex),
);
const audioAttachmentIndexes = new Set(
outputs
-.filter((output) => output.kind === "audio.transcription")
+.filter(
+(output) =>
+output.kind === "audio.transcription" &&
+!syntheticAudioIndexes.has(output.attachmentIndex),
+)
.map((output) => output.attachmentIndex),
);
const fileBlocks = await extractFileBlocks({


@@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
});
expect(body).toBe("[Image]\nDescription:\na cat");
});
it("labels audio transcripts by their attachment order", () => {
const body = formatMediaUnderstandingBody({
outputs: [
{
kind: "audio.transcription",
attachmentIndex: 0,
text: "first clip was silent",
provider: "openclaw",
},
{
kind: "audio.transcription",
attachmentIndex: 1,
text: "second clip has speech",
provider: "groq",
},
],
});
expect(body).toBe(
[
"[Audio 1/2]\nTranscript:\nfirst clip was silent",
"[Audio 2/2]\nTranscript:\nsecond clip has speech",
].join("\n\n"),
);
});
});