fix: add placeholder transcript for silent voice notes (#49131)

* fix: add placeholder transcript for silent voice notes

* fix: handle placeholder transcripts per skipped attachment

* fix: preserve synthetic transcript attachment order

* fix: scope synthetic audio merge to the audio slice only, preserving cross-capability and attachments.prefer ordering

Replace the global outputs.sort() with a targeted merge that:
1. Only sorts within the audio output slice (real + synthetic),
   preserving CAPABILITY_ORDER and per-capability attachments.prefer
   ordering for non-audio outputs.
2. Excludes synthetic placeholder indexes from audioAttachmentIndexes
   used by extractFileBlocks, so tiny audio-MIME files with text
   extensions can still be recovered via forcedTextMime.

Adds mergeAudioOutputsPreservingAttachmentOrder helper.
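
A minimal TypeScript sketch of point 2; `realAudioIndexes` is an illustrative helper name for what the diff below does inline when building `audioAttachmentIndexes` for `extractFileBlocks`:

    type AudioOutput = { kind: string; attachmentIndex: number };

    // Collect only attachments that got a real transcription; synthetic
    // placeholder indexes are dropped so extractFileBlocks can still attempt
    // forcedTextMime recovery on tiny audio-MIME files with text extensions.
    function realAudioIndexes(outputs: AudioOutput[], synthetic: AudioOutput[]): Set<number> {
      const syntheticIndexes = new Set(synthetic.map((o) => o.attachmentIndex));
      return new Set(
        outputs
          .filter(
            (o) => o.kind === "audio.transcription" && !syntheticIndexes.has(o.attachmentIndex),
          )
          .map((o) => o.attachmentIndex),
      );
    }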

* fix: remove unused function and use toSorted() for oxlint compliance

* fix(media-understanding): preserve selected audio order for synthetic placeholders

- merge synthetic skipped-audio placeholders using audio decision order
  instead of raw attachmentIndex sorting, preserving attachments.prefer
- insert synthetic-only audio outputs at the audio capability slot
  (before video) when no real audio outputs were produced
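
A minimal TypeScript sketch of this merge, mirroring the inline logic in applyMediaUnderstanding further down; the `mergeSyntheticAudio` name and the simplified `MediaOutput` type are illustrative, not part of the change:

    type MediaOutput = {
      kind: "image.description" | "audio.transcription" | "video.description";
      attachmentIndex: number;
      text: string;
    };

    // `synthetic` is assumed to already exclude attachments that produced a
    // real transcript, matching the filter in the diff below.
    function mergeSyntheticAudio(
      outputs: MediaOutput[],
      synthetic: MediaOutput[],
      audioAttachmentOrder: number[],
    ): MediaOutput[] {
      if (synthetic.length === 0) return outputs;
      const byAttachment = new Map<number, MediaOutput>();
      for (const o of outputs) {
        if (o.kind === "audio.transcription") byAttachment.set(o.attachmentIndex, o);
      }
      for (const o of synthetic) byAttachment.set(o.attachmentIndex, o);
      // Re-emit audio in the order the audio decision selected the attachments
      // (attachments.prefer), not raw attachmentIndex order.
      const mergedAudio = audioAttachmentOrder
        .map((i) => byAttachment.get(i))
        .filter((o): o is MediaOutput => Boolean(o));
      const firstAudio = outputs.findIndex((o) => o.kind === "audio.transcription");
      if (firstAudio >= 0) {
        // Replace the existing audio slice in place; image/video ordering is untouched.
        const lastAudio = outputs.reduce(
          (last, o, i) => (o.kind === "audio.transcription" ? i : last),
          firstAudio,
        );
        return [...outputs.slice(0, firstAudio), ...mergedAudio, ...outputs.slice(lastAudio + 1)];
      }
      // No real audio output: insert the synthetic slice at the audio capability
      // slot, i.e. before the first video description, else at the end.
      const firstVideo = outputs.findIndex((o) => o.kind === "video.description");
      const insertAt = firstVideo >= 0 ? firstVideo : outputs.length;
      return [...outputs.slice(0, insertAt), ...mergedAudio, ...outputs.slice(insertAt)];
    }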

* fix(media-understanding): use neutral too-small placeholder text

Clarify that this synthetic transcript path is triggered by attachment size,
not by a silence/no-speech detection result.
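
For context, a small sketch of the trigger, assuming the 1024-byte threshold documented below; `MIN_AUDIO_FILE_BYTES` and the placeholder text appear in the diff, while this helper itself is illustrative:

    const MIN_AUDIO_FILE_BYTES = 1024;
    const EMPTY_VOICE_NOTE_PLACEHOLDER =
      "[Voice note could not be transcribed because the audio attachment was too small]";

    // The placeholder is driven purely by attachment size; no silence or
    // voice-activity detection runs on the decoded audio.
    function placeholderTranscriptFor(audio: Buffer): string | undefined {
      return audio.length < MIN_AUDIO_FILE_BYTES ? EMPTY_VOICE_NOTE_PLACEHOLDER : undefined;
    }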

* test(media-understanding): update too-small audio placeholder expectations

* test(media-understanding): cover mixed too-small audio placeholder

* test(media-understanding): cover too-small audio context

* fix(tasks): preserve visible task title before internal context

* Revert "fix(tasks): preserve visible task title before internal context"

This reverts commit dc536fb4d3c8a01168de5d05e8562193dd68a88e.

---------

Co-authored-by: Eulices Lopez <eulices@users.noreply.github.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
Author: Eulices
Date: 2026-04-26 00:14:01 -04:00
Committed by: GitHub
Parent: bcc9fc4cf5
Commit: 008e4ca81f
5 changed files with 287 additions and 5 deletions


@@ -91,6 +91,7 @@ Docs: https://docs.openclaw.ai
keeping those envelopes out of ACP transcripts.
- TTS/status: show configured TTS model, voice, and sanitized custom endpoint in `/status`, preserve OpenAI-compatible TTS instructions on custom endpoints, and retry empty Microsoft/Edge TTS output once. Addresses #46602, #47232, and #43936. Thanks @leekuangtao, @Huntterxx, and @rex993.
- Agents/Gateway: steer agent-driven config edits and restarts through the owner-only `gateway` tool, document `config.schema.lookup` as the field-doc source, and warn against using `gateway stop && gateway start` as a restart substitute on macOS. Fixes #71929. Thanks @ygc3817922006-sketch.
- Media understanding/audio: inject a deterministic transcript placeholder for too-small voice notes so agents do not hallucinate transcription or provider failures. Fixes #48944. Thanks @eulicesl.
- Providers/vLLM: send Nemotron 3 chat-template kwargs when thinking is off
and honor configured `params.chat_template_kwargs` for OpenAI-compatible
completions, so vLLM/Nemotron replies stay visible instead of becoming


@@ -130,7 +130,7 @@ Recommended defaults:
Rules:
- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
-- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription.
+- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription; inbound reply context receives a deterministic placeholder transcript so the agent knows the note was too small.
- If the model returns more than `maxChars`, output is trimmed.
- `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
- If the active primary image model already supports vision natively, OpenClaw


@@ -459,8 +459,7 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
});
it("skips URL-only audio when remote file is too small", async () => {
// Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
it("injects a placeholder transcript when URL-only audio is too small", async () => {
mockedFetchRemoteMedia.mockResolvedValueOnce({
buffer: Buffer.alloc(100),
contentType: "audio/ogg",
@@ -499,7 +498,66 @@ describe("applyMediaUnderstanding", () => {
});
expect(transcribeAudio).not.toHaveBeenCalled();
-expect(result.appliedAudio).toBe(false);
+expect(result.appliedAudio).toBe(true);
expect(result.outputs).toEqual([
expect.objectContaining({
kind: "audio.transcription",
text: "[Voice note could not be transcribed because the audio attachment was too small]",
provider: "openclaw",
model: "synthetic-empty-audio",
}),
]);
expect(ctx.Transcript).toBe(
"[Voice note could not be transcribed because the audio attachment was too small]",
);
expect(ctx.Body).toBe(
"[Audio]\nTranscript:\n[Voice note could not be transcribed because the audio attachment was too small]",
);
});
it("injects a placeholder transcript when local-path audio is too small", async () => {
const ctx = await createAudioCtx({
fileName: "tiny.ogg",
mediaType: "audio/ogg",
content: Buffer.alloc(100),
});
const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: { id: "groq", transcribeAudio },
},
});
expect(transcribeAudio).not.toHaveBeenCalled();
expect(result.appliedAudio).toBe(true);
expect(result.outputs).toEqual([
expect.objectContaining({
kind: "audio.transcription",
text: "[Voice note could not be transcribed because the audio attachment was too small]",
provider: "openclaw",
model: "synthetic-empty-audio",
}),
]);
expect(ctx.Transcript).toBe(
"[Voice note could not be transcribed because the audio attachment was too small]",
);
expect(ctx.Body).toBe(
"[Audio]\nTranscript:\n[Voice note could not be transcribed because the audio attachment was too small]",
);
});
it("skips audio transcription when attachment exceeds maxBytes", async () => {
@@ -969,6 +1027,56 @@ describe("applyMediaUnderstanding", () => {
);
});
it("adds placeholder for tooSmall audio while preserving real transcript for valid audio", async () => {
const dir = await createTempMediaDir();
const validAudio = createSafeAudioFixtureBuffer(2048);
const tinyAudio = Buffer.alloc(100);
const validPath = path.join(dir, "valid.ogg");
const tinyPath = path.join(dir, "tiny.ogg");
await fs.writeFile(validPath, validAudio);
await fs.writeFile(tinyPath, tinyAudio);
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPaths: [validPath, tinyPath],
MediaTypes: ["audio/ogg", "audio/ogg"],
};
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
attachments: { mode: "all", maxAttachments: 2 },
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async (req) => ({ text: `transcribed ${req.fileName ?? "unknown"}` }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toContain("transcribed valid.ogg");
expect(ctx.Transcript).toContain(
"[Voice note could not be transcribed because the audio attachment was too small]",
);
expect(ctx.Body).toContain("[Audio 1/2]");
expect(ctx.Body).toContain("transcribed valid.ogg");
expect(ctx.Body).toContain("[Audio 2/2]");
expect(ctx.Body).toContain(
"[Voice note could not be transcribed because the audio attachment was too small]",
);
});
it("orders mixed media outputs as image, audio, video", async () => {
const dir = await createTempMediaDir();
const imagePath = path.join(dir, "photo.jpg");
@@ -1028,6 +1136,68 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.BodyForCommands).toBe("audio ok");
});
it("orders synthetic too-small audio output between image and video", async () => {
const dir = await createTempMediaDir();
const imagePath = path.join(dir, "photo.jpg");
const audioPath = path.join(dir, "silent.ogg");
const videoPath = path.join(dir, "clip.mp4");
await fs.writeFile(imagePath, "image-bytes");
await fs.writeFile(audioPath, Buffer.alloc(100));
await fs.writeFile(videoPath, "video-bytes");
const ctx: MsgContext = {
Body: "<media:mixed>",
MediaPaths: [imagePath, audioPath, videoPath],
MediaTypes: ["image/jpeg", "audio/ogg", "video/mp4"],
};
const cfg: OpenClawConfig = {
tools: {
media: {
image: { enabled: true, models: [{ provider: "openai", model: "gpt-5.4" }] },
audio: { enabled: true, models: [{ provider: "groq" }] },
video: { enabled: true, models: [{ provider: "google", model: "gemini-3" }] },
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
agentDir: dir,
providers: {
openai: {
id: "openai",
describeImage: async () => ({ text: "image ok" }),
},
groq: {
id: "groq",
transcribeAudio: async () => ({ text: "audio should not run" }),
},
google: {
id: "google",
describeVideo: async () => ({ text: "video ok" }),
},
},
});
const placeholder =
"[Voice note could not be transcribed because the audio attachment was too small]";
expect(result.appliedImage).toBe(true);
expect(result.appliedAudio).toBe(true);
expect(result.appliedVideo).toBe(true);
expect(ctx.Body).toBe(
[
"[Image]\nDescription:\nimage ok",
`[Audio]\nTranscript:\n${placeholder}`,
"[Video]\nDescription:\nvideo ok",
].join("\n\n"),
);
expect(ctx.Transcript).toBe(placeholder);
expect(ctx.CommandBody).toBe(placeholder);
expect(ctx.BodyForCommands).toBe(placeholder);
});
it("treats text-like attachments as CSV (comma wins over tabs)", async () => {
const csvText = '"a","b"\t"c"\n"1","2"\t"3"';
const csvPath = await createTempMediaFile({


@@ -48,6 +48,8 @@ export type ApplyMediaUnderstandingResult = {
};
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
const EMPTY_VOICE_NOTE_PLACEHOLDER =
"[Voice note could not be transcribed because the audio attachment was too small]";
const EXTRA_TEXT_MIMES = [
"application/xml",
"text/xml",
@@ -306,6 +308,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
return TEXT_EXT_MIME.get(ext);
}
function buildSyntheticSkippedAudioOutputs(
decisions: MediaUnderstandingDecision[],
): MediaUnderstandingOutput[] {
const audioDecision = decisions.find((decision) => decision.capability === "audio");
if (!audioDecision) {
return [];
}
return audioDecision.attachments.flatMap((attachment) => {
const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
attempt.reason?.trim().startsWith("tooSmall"),
);
if (!hasTooSmallAttempt) {
return [];
}
return [
{
kind: "audio.transcription" as const,
attachmentIndex: attachment.attachmentIndex,
text: EMPTY_VOICE_NOTE_PLACEHOLDER,
provider: "openclaw",
model: "synthetic-empty-audio",
},
];
});
}
function isBinaryMediaMime(mime?: string): boolean {
if (!mime) {
return false;
@@ -527,6 +555,54 @@ export async function applyMediaUnderstanding(params: {
decisions.push(entry.decision);
}
const audioOutputAttachmentIndexes = new Set(
outputs
.filter((output) => output.kind === "audio.transcription")
.map((output) => output.attachmentIndex),
);
const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
(output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
);
// Merge synthetic placeholders into the audio slice while preserving the
// selected audio attachment order from `runCapability()` / `attachments.prefer`.
// When audio produced no real outputs, insert the synthetic slice at the
// audio capability slot (before video) instead of appending at the end.
if (syntheticSkippedAudioOutputs.length > 0) {
const audioDecision = decisions.find((decision) => decision.capability === "audio");
const audioAttachmentOrder =
audioDecision?.attachments.map((attachment) => attachment.attachmentIndex) ?? [];
const audioOutputsByAttachmentIndex = new Map<number, MediaUnderstandingOutput>();
for (const output of outputs) {
if (output.kind === "audio.transcription") {
audioOutputsByAttachmentIndex.set(output.attachmentIndex, output);
}
}
for (const output of syntheticSkippedAudioOutputs) {
audioOutputsByAttachmentIndex.set(output.attachmentIndex, output);
}
const mergedAudio = audioAttachmentOrder
.map((attachmentIndex) => audioOutputsByAttachmentIndex.get(attachmentIndex))
.filter((output): output is MediaUnderstandingOutput => Boolean(output));
const firstAudioIdx = outputs.findIndex((o) => o.kind === "audio.transcription");
if (firstAudioIdx >= 0) {
const before = outputs.slice(0, firstAudioIdx);
const afterLastAudio = outputs.slice(
outputs.reduce(
(last, o, i) => (o.kind === "audio.transcription" ? i : last),
firstAudioIdx,
) + 1,
);
outputs.length = 0;
outputs.push(...before, ...mergedAudio, ...afterLastAudio);
} else {
const firstVideoIdx = outputs.findIndex((o) => o.kind === "video.description");
const audioInsertIdx = firstVideoIdx >= 0 ? firstVideoIdx : outputs.length;
outputs.splice(audioInsertIdx, 0, ...mergedAudio);
}
}
if (decisions.length > 0) {
ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
}
@@ -560,9 +636,19 @@ export async function applyMediaUnderstanding(params: {
}
ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
}
// Only skip file extraction for attachments that have a real (non-synthetic)
// audio transcription. Synthetic placeholders should not prevent file extraction
// for tiny audio-MIME files that could be recovered as text via forcedTextMime.
const syntheticAudioIndexes = new Set(
syntheticSkippedAudioOutputs.map((o) => o.attachmentIndex),
);
const audioAttachmentIndexes = new Set(
outputs
-.filter((output) => output.kind === "audio.transcription")
+.filter(
+(output) =>
+output.kind === "audio.transcription" &&
+!syntheticAudioIndexes.has(output.attachmentIndex),
+)
.map((output) => output.attachmentIndex),
);
const fileBlocks = await extractFileBlocks({


@@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
});
expect(body).toBe("[Image]\nDescription:\na cat");
});
it("labels audio transcripts by their attachment order", () => {
const body = formatMediaUnderstandingBody({
outputs: [
{
kind: "audio.transcription",
attachmentIndex: 0,
text: "first clip was silent",
provider: "openclaw",
},
{
kind: "audio.transcription",
attachmentIndex: 1,
text: "second clip has speech",
provider: "groq",
},
],
});
expect(body).toBe(
[
"[Audio 1/2]\nTranscript:\nfirst clip was silent",
"[Audio 2/2]\nTranscript:\nsecond clip has speech",
].join("\n\n"),
);
});
});