fix: add placeholder transcript for silent voice notes (#49131)
* fix: add placeholder transcript for silent voice notes

* fix: handle placeholder transcripts per skipped attachment

* fix: preserve synthetic transcript attachment order

* fix: scope synthetic audio merge to audio slice only, preserve cross-capability and prefer ordering

  Replace the global outputs.sort() with a targeted merge that:

  1. Only sorts within the audio output slice (real + synthetic), preserving CAPABILITY_ORDER and per-capability attachments.prefer ordering for non-audio outputs.
  2. Excludes synthetic placeholder indexes from audioAttachmentIndexes used by extractFileBlocks, so tiny audio-MIME files with text extensions can still be recovered via forcedTextMime.

  Adds mergeAudioOutputsPreservingAttachmentOrder helper.

* fix: remove unused function and use toSorted() for oxlint compliance

* fix(media-understanding): preserve selected audio order for synthetic placeholders

  - merge synthetic skipped-audio placeholders using audio decision order instead of raw attachmentIndex sorting, preserving attachments.prefer
  - insert synthetic-only audio outputs at the audio capability slot (before video) when no real audio outputs were produced

* fix(media-understanding): use neutral too-small placeholder text

  Clarify that this synthetic transcript path is triggered by attachment size, not by a silence/no-speech detection result.

* test(media-understanding): update too-small audio placeholder expectations

* test(media-understanding): cover mixed too-small audio placeholder

* test(media-understanding): cover too-small audio context

* fix(tasks): preserve visible task title before internal context

* Revert "fix(tasks): preserve visible task title before internal context"

  This reverts commit dc536fb4d3c8a01168de5d05e8562193dd68a88e.

---------

Co-authored-by: Eulices Lopez <eulices@users.noreply.github.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
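For a concrete sense of the ordering bug the "scope synthetic audio merge" item describes, here is a minimal sketch with simplified output shapes (not the repository's real types; the actual fix is the `mergeAudioOutputsPreservingAttachmentOrder` helper shown later in this diff):

```ts
// Sketch only: why a global sort by attachmentIndex breaks capability order.
type Output = { kind: string; attachmentIndex: number };

// Outputs grouped by CAPABILITY_ORDER (image, audio, video), where the image
// attachment happens to carry the highest raw index.
const outputs: Output[] = [
  { kind: "image.description", attachmentIndex: 2 },
  { kind: "audio.transcription", attachmentIndex: 0 },
  { kind: "video.description", attachmentIndex: 1 },
];

// The old global sort reorders across capabilities:
// audio(0), video(1), image(2), so the image/audio/video grouping is lost.
const broken = outputs.toSorted((a, b) => a.attachmentIndex - b.attachmentIndex);

// The targeted merge instead sorts only within the audio slice (the real code
// uses the audio decision's selected order) and leaves non-audio outputs
// exactly where CAPABILITY_ORDER placed them.
const audioSlice = outputs
  .filter((o) => o.kind === "audio.transcription")
  .toSorted((a, b) => a.attachmentIndex - b.attachmentIndex);
```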
@@ -91,6 +91,7 @@ Docs: https://docs.openclaw.ai
   keeping those envelopes out of ACP transcripts.
 - TTS/status: show configured TTS model, voice, and sanitized custom endpoint in `/status`, preserve OpenAI-compatible TTS instructions on custom endpoints, and retry empty Microsoft/Edge TTS output once. Addresses #46602, #47232, and #43936. Thanks @leekuangtao, @Huntterxx, and @rex993.
 - Agents/Gateway: steer agent-driven config edits and restarts through the owner-only `gateway` tool, document `config.schema.lookup` as the field-doc source, and warn against using `gateway stop && gateway start` as a restart substitute on macOS. Fixes #71929. Thanks @ygc3817922006-sketch.
+- Media understanding/audio: inject a deterministic transcript placeholder for too-small voice notes so agents do not hallucinate transcription or provider failures. Fixes #48944. Thanks @eulicesl.
 - Providers/vLLM: send Nemotron 3 chat-template kwargs when thinking is off
   and honor configured `params.chat_template_kwargs` for OpenAI-compatible
   completions, so vLLM/Nemotron replies stay visible instead of becoming
@@ -130,7 +130,7 @@ Recommended defaults:
 Rules:
 
 - If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
-- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription.
+- Audio files smaller than **1024 bytes** are treated as empty/corrupt and skipped before provider/CLI transcription; inbound reply context receives a deterministic placeholder transcript so the agent knows the note was too small.
 - If the model returns more than `maxChars`, output is trimmed.
 - `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
 - If the active primary image model already supports vision natively, OpenClaw
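For context on the 1024-byte rule above: it amounts to a size gate checked before any provider call. A minimal sketch, assuming the `MIN_AUDIO_FILE_BYTES` constant named in this commit's test comments (the guard function around it is an assumption, not OpenClaw's exact code):

```ts
import { Buffer } from "node:buffer";

// Sketch only: the constant name is taken from the test comments in this
// commit; the guard function itself is hypothetical.
const MIN_AUDIO_FILE_BYTES = 1024;

function isTooSmallAudio(buffer: Buffer): boolean {
  // Anything under 1 KiB is treated as empty/corrupt: transcription is
  // skipped and the deterministic placeholder transcript is injected instead.
  return buffer.length < MIN_AUDIO_FILE_BYTES;
}
```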
@@ -459,8 +459,7 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.Body).toBe("[Audio]\nTranscript:\nwhatsapp transcript");
   });
 
-  it("skips URL-only audio when remote file is too small", async () => {
-    // Override the default mock to return a tiny buffer (below MIN_AUDIO_FILE_BYTES)
+  it("injects a placeholder transcript when URL-only audio is too small", async () => {
     mockedFetchRemoteMedia.mockResolvedValueOnce({
       buffer: Buffer.alloc(100),
       contentType: "audio/ogg",
@@ -499,7 +498,66 @@ describe("applyMediaUnderstanding", () => {
     });
 
     expect(transcribeAudio).not.toHaveBeenCalled();
-    expect(result.appliedAudio).toBe(false);
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note could not be transcribed because the audio attachment was too small]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note could not be transcribed because the audio attachment was too small]",
+    );
   });
 
+  it("injects a placeholder transcript when local-path audio is too small", async () => {
+    const ctx = await createAudioCtx({
+      fileName: "tiny.ogg",
+      mediaType: "audio/ogg",
+      content: Buffer.alloc(100),
+    });
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: { id: "groq", transcribeAudio },
+      },
+    });
+
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(result.appliedAudio).toBe(true);
+    expect(result.outputs).toEqual([
+      expect.objectContaining({
+        kind: "audio.transcription",
+        text: "[Voice note could not be transcribed because the audio attachment was too small]",
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      }),
+    ]);
+    expect(ctx.Transcript).toBe(
+      "[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+    expect(ctx.Body).toBe(
+      "[Audio]\nTranscript:\n[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+  });
+
   it("skips audio transcription when attachment exceeds maxBytes", async () => {
@@ -969,6 +1027,56 @@ describe("applyMediaUnderstanding", () => {
     );
   });
 
+  it("adds placeholder for tooSmall audio while preserving real transcript for valid audio", async () => {
+    const dir = await createTempMediaDir();
+    const validAudio = createSafeAudioFixtureBuffer(2048);
+    const tinyAudio = Buffer.alloc(100);
+    const validPath = path.join(dir, "valid.ogg");
+    const tinyPath = path.join(dir, "tiny.ogg");
+    await fs.writeFile(validPath, validAudio);
+    await fs.writeFile(tinyPath, tinyAudio);
+
+    const ctx: MsgContext = {
+      Body: "<media:audio>",
+      MediaPaths: [validPath, tinyPath],
+      MediaTypes: ["audio/ogg", "audio/ogg"],
+    };
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            attachments: { mode: "all", maxAttachments: 2 },
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: {
+          id: "groq",
+          transcribeAudio: async (req) => ({ text: `transcribed ${req.fileName ?? "unknown"}` }),
+        },
+      },
+    });
+
+    expect(result.appliedAudio).toBe(true);
+    expect(ctx.Transcript).toContain("transcribed valid.ogg");
+    expect(ctx.Transcript).toContain(
+      "[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+    expect(ctx.Body).toContain("[Audio 1/2]");
+    expect(ctx.Body).toContain("transcribed valid.ogg");
+    expect(ctx.Body).toContain("[Audio 2/2]");
+    expect(ctx.Body).toContain(
+      "[Voice note could not be transcribed because the audio attachment was too small]",
+    );
+  });
+
   it("orders mixed media outputs as image, audio, video", async () => {
     const dir = await createTempMediaDir();
     const imagePath = path.join(dir, "photo.jpg");
@@ -1028,6 +1136,68 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.BodyForCommands).toBe("audio ok");
   });
 
+  it("orders synthetic too-small audio output between image and video", async () => {
+    const dir = await createTempMediaDir();
+    const imagePath = path.join(dir, "photo.jpg");
+    const audioPath = path.join(dir, "silent.ogg");
+    const videoPath = path.join(dir, "clip.mp4");
+    await fs.writeFile(imagePath, "image-bytes");
+    await fs.writeFile(audioPath, Buffer.alloc(100));
+    await fs.writeFile(videoPath, "video-bytes");
+
+    const ctx: MsgContext = {
+      Body: "<media:mixed>",
+      MediaPaths: [imagePath, audioPath, videoPath],
+      MediaTypes: ["image/jpeg", "audio/ogg", "video/mp4"],
+    };
+    const cfg: OpenClawConfig = {
+      tools: {
+        media: {
+          image: { enabled: true, models: [{ provider: "openai", model: "gpt-5.4" }] },
+          audio: { enabled: true, models: [{ provider: "groq" }] },
+          video: { enabled: true, models: [{ provider: "google", model: "gemini-3" }] },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      agentDir: dir,
+      providers: {
+        openai: {
+          id: "openai",
+          describeImage: async () => ({ text: "image ok" }),
+        },
+        groq: {
+          id: "groq",
+          transcribeAudio: async () => ({ text: "audio should not run" }),
+        },
+        google: {
+          id: "google",
+          describeVideo: async () => ({ text: "video ok" }),
+        },
+      },
+    });
+
+    const placeholder =
+      "[Voice note could not be transcribed because the audio attachment was too small]";
+
+    expect(result.appliedImage).toBe(true);
+    expect(result.appliedAudio).toBe(true);
+    expect(result.appliedVideo).toBe(true);
+    expect(ctx.Body).toBe(
+      [
+        "[Image]\nDescription:\nimage ok",
+        `[Audio]\nTranscript:\n${placeholder}`,
+        "[Video]\nDescription:\nvideo ok",
+      ].join("\n\n"),
+    );
+    expect(ctx.Transcript).toBe(placeholder);
+    expect(ctx.CommandBody).toBe(placeholder);
+    expect(ctx.BodyForCommands).toBe(placeholder);
+  });
+
   it("treats text-like attachments as CSV (comma wins over tabs)", async () => {
     const csvText = '"a","b"\t"c"\n"1","2"\t"3"';
     const csvPath = await createTempMediaFile({
@@ -48,6 +48,8 @@ export type ApplyMediaUnderstandingResult = {
 };
 
 const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
+const EMPTY_VOICE_NOTE_PLACEHOLDER =
+  "[Voice note could not be transcribed because the audio attachment was too small]";
 const EXTRA_TEXT_MIMES = [
   "application/xml",
   "text/xml",
@@ -306,6 +308,32 @@ function resolveTextMimeFromName(name?: string): string | undefined {
   return TEXT_EXT_MIME.get(ext);
 }
 
+function buildSyntheticSkippedAudioOutputs(
+  decisions: MediaUnderstandingDecision[],
+): MediaUnderstandingOutput[] {
+  const audioDecision = decisions.find((decision) => decision.capability === "audio");
+  if (!audioDecision) {
+    return [];
+  }
+  return audioDecision.attachments.flatMap((attachment) => {
+    const hasTooSmallAttempt = attachment.attempts.some((attempt) =>
+      attempt.reason?.trim().startsWith("tooSmall"),
+    );
+    if (!hasTooSmallAttempt) {
+      return [];
+    }
+    return [
+      {
+        kind: "audio.transcription" as const,
+        attachmentIndex: attachment.attachmentIndex,
+        text: EMPTY_VOICE_NOTE_PLACEHOLDER,
+        provider: "openclaw",
+        model: "synthetic-empty-audio",
+      },
+    ];
+  });
+}
+
 function isBinaryMediaMime(mime?: string): boolean {
   if (!mime) {
     return false;
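To illustrate the helper above with hypothetical data (decision shapes abbreviated; the exact wording of the `tooSmall` reason string is an assumption, since the code only checks the prefix):

```ts
// Hypothetical input for buildSyntheticSkippedAudioOutputs.
const decisions = [
  {
    capability: "audio",
    attachments: [
      // Transcribed normally: no "tooSmall" attempt, so no placeholder.
      { attachmentIndex: 0, attempts: [{ reason: "ok" }] },
      // Skipped as too small: yields one synthetic placeholder output.
      { attachmentIndex: 1, attempts: [{ reason: "tooSmall (100 bytes)" }] },
    ],
  },
];
// Expected result: a single synthetic entry for attachmentIndex 1:
// { kind: "audio.transcription", attachmentIndex: 1,
//   text: EMPTY_VOICE_NOTE_PLACEHOLDER, provider: "openclaw",
//   model: "synthetic-empty-audio" }
```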
@@ -527,6 +555,54 @@ export async function applyMediaUnderstanding(params: {
     decisions.push(entry.decision);
   }
 
+  const audioOutputAttachmentIndexes = new Set(
+    outputs
+      .filter((output) => output.kind === "audio.transcription")
+      .map((output) => output.attachmentIndex),
+  );
+  const syntheticSkippedAudioOutputs = buildSyntheticSkippedAudioOutputs(decisions).filter(
+    (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex),
+  );
+
+  // Merge synthetic placeholders into the audio slice while preserving the
+  // selected audio attachment order from `runCapability()` / `attachments.prefer`.
+  // When audio produced no real outputs, insert the synthetic slice at the
+  // audio capability slot (before video) instead of appending at the end.
+  if (syntheticSkippedAudioOutputs.length > 0) {
+    const audioDecision = decisions.find((decision) => decision.capability === "audio");
+    const audioAttachmentOrder =
+      audioDecision?.attachments.map((attachment) => attachment.attachmentIndex) ?? [];
+    const audioOutputsByAttachmentIndex = new Map<number, MediaUnderstandingOutput>();
+    for (const output of outputs) {
+      if (output.kind === "audio.transcription") {
+        audioOutputsByAttachmentIndex.set(output.attachmentIndex, output);
+      }
+    }
+    for (const output of syntheticSkippedAudioOutputs) {
+      audioOutputsByAttachmentIndex.set(output.attachmentIndex, output);
+    }
+    const mergedAudio = audioAttachmentOrder
+      .map((attachmentIndex) => audioOutputsByAttachmentIndex.get(attachmentIndex))
+      .filter((output): output is MediaUnderstandingOutput => Boolean(output));
+
+    const firstAudioIdx = outputs.findIndex((o) => o.kind === "audio.transcription");
+    if (firstAudioIdx >= 0) {
+      const before = outputs.slice(0, firstAudioIdx);
+      const afterLastAudio = outputs.slice(
+        outputs.reduce(
+          (last, o, i) => (o.kind === "audio.transcription" ? i : last),
+          firstAudioIdx,
+        ) + 1,
+      );
+      outputs.length = 0;
+      outputs.push(...before, ...mergedAudio, ...afterLastAudio);
+    } else {
+      const firstVideoIdx = outputs.findIndex((o) => o.kind === "video.description");
+      const audioInsertIdx = firstVideoIdx >= 0 ? firstVideoIdx : outputs.length;
+      outputs.splice(audioInsertIdx, 0, ...mergedAudio);
+    }
+  }
+
   if (decisions.length > 0) {
     ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
   }
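A quick trace of the synthetic-only `else` branch above, using hypothetical outputs (this mirrors the image/audio/video ordering test added in this commit):

```ts
// Hypothetical values; "<placeholder>" stands in for EMPTY_VOICE_NOTE_PLACEHOLDER.
const outputs = [
  { kind: "image.description", attachmentIndex: 0, text: "image ok" },
  { kind: "video.description", attachmentIndex: 2, text: "video ok" },
];
const mergedAudio = [{ kind: "audio.transcription", attachmentIndex: 1, text: "<placeholder>" }];

// No real audio output exists, so the merged synthetic slice is spliced in at
// the first video slot, keeping image -> audio -> video capability order.
const firstVideoIdx = outputs.findIndex((o) => o.kind === "video.description"); // 1
outputs.splice(firstVideoIdx, 0, ...mergedAudio);
// outputs: image.description, audio.transcription, video.description
```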
@@ -560,9 +636,19 @@ export async function applyMediaUnderstanding(params: {
     }
     ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
   }
+  // Only skip file extraction for attachments that have a real (non-synthetic)
+  // audio transcription. Synthetic placeholders should not prevent file extraction
+  // for tiny audio-MIME files that could be recovered as text via forcedTextMime.
+  const syntheticAudioIndexes = new Set(
+    syntheticSkippedAudioOutputs.map((o) => o.attachmentIndex),
+  );
   const audioAttachmentIndexes = new Set(
     outputs
-      .filter((output) => output.kind === "audio.transcription")
+      .filter(
+        (output) =>
+          output.kind === "audio.transcription" &&
+          !syntheticAudioIndexes.has(output.attachmentIndex),
+      )
       .map((output) => output.attachmentIndex),
   );
   const fileBlocks = await extractFileBlocks({
@@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => {
     });
     expect(body).toBe("[Image]\nDescription:\na cat");
   });
+
+  it("labels audio transcripts by their attachment order", () => {
+    const body = formatMediaUnderstandingBody({
+      outputs: [
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0,
+          text: "first clip was silent",
+          provider: "openclaw",
+        },
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 1,
+          text: "second clip has speech",
+          provider: "groq",
+        },
+      ],
+    });
+    expect(body).toBe(
+      [
+        "[Audio 1/2]\nTranscript:\nfirst clip was silent",
+        "[Audio 2/2]\nTranscript:\nsecond clip has speech",
+      ].join("\n\n"),
+    );
+  });
 });