Mirror of https://github.com/openclaw/openclaw.git, synced 2026-05-17 07:40:44 +00:00
* fix: add placeholder transcript for silent voice notes
* fix: handle placeholder transcripts per skipped attachment
* fix: preserve synthetic transcript attachment order
* fix: scope synthetic audio merge to audio slice only, preserve cross-capability and prefer ordering

  Replace the global outputs.sort() with a targeted merge that:
  1. Only sorts within the audio output slice (real + synthetic), preserving CAPABILITY_ORDER and per-capability attachments.prefer ordering for non-audio outputs.
  2. Excludes synthetic placeholder indexes from the audioAttachmentIndexes used by extractFileBlocks, so tiny audio-MIME files with text extensions can still be recovered via forcedTextMime.

  Adds the mergeAudioOutputsPreservingAttachmentOrder helper (see the sketch after this log).
* fix: remove unused function and use toSorted() for oxlint compliance
* fix(media-understanding): preserve selected audio order for synthetic placeholders
  - merge synthetic skipped-audio placeholders using audio decision order instead of raw attachmentIndex sorting, preserving attachments.prefer
  - insert synthetic-only audio outputs at the audio capability slot (before video) when no real audio outputs were produced
* fix(media-understanding): use neutral too-small placeholder text

  Clarify that this synthetic transcript path is triggered by attachment size, not by a silence/no-speech detection result.
* test(media-understanding): update too-small audio placeholder expectations
* test(media-understanding): cover mixed too-small audio placeholder
* test(media-understanding): cover too-small audio context
* fix(tasks): preserve visible task title before internal context
* Revert "fix(tasks): preserve visible task title before internal context"

  This reverts commit dc536fb4d3c8a01168de5d05e8562193dd68a88e.

---------

Co-authored-by: Eulices Lopez <eulices@users.noreply.github.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
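For orientation, the following is a hedged sketch of the targeted merge described in the commit log above, not the actual openclaw implementation. Only the helper name mergeAudioOutputsPreservingAttachmentOrder and the use of toSorted() appear in the commits; the MediaOutput shape mirrors the objects in the test file below, while the synthetic flag and the decisionOrder parameter are assumptions inferred from the commit messages.

type MediaOutput = {
  kind: string; // e.g. "audio.transcription", "video.description"
  attachmentIndex: number;
  text: string;
  provider: string;
  synthetic?: boolean; // assumed marker for too-small placeholder transcripts
};

const isAudio = (o: MediaOutput): boolean => o.kind.startsWith("audio.");

// Merge synthetic skipped-audio placeholders into the outputs without a
// global sort: non-audio outputs keep their capability-ordered positions,
// and only the audio slice is re-ordered, by audio decision order (which
// honors attachments.prefer) rather than by raw attachmentIndex.
function mergeAudioOutputsPreservingAttachmentOrder(
  outputs: MediaOutput[],
  synthetic: MediaOutput[],
  decisionOrder: number[], // attachment indexes in audio selection order
): MediaOutput[] {
  if (synthetic.length === 0) return outputs;

  const rank = new Map<number, number>();
  decisionOrder.forEach((idx, i) => rank.set(idx, i));
  const byDecision = (a: MediaOutput, b: MediaOutput): number =>
    (rank.get(a.attachmentIndex) ?? a.attachmentIndex) -
    (rank.get(b.attachmentIndex) ?? b.attachmentIndex);

  const firstAudio = outputs.findIndex(isAudio);
  if (firstAudio === -1) {
    // Synthetic-only case: splice the placeholders in at the audio
    // capability slot, i.e. before the first video output.
    const firstVideo = outputs.findIndex((o) => o.kind.startsWith("video."));
    const at = firstVideo === -1 ? outputs.length : firstVideo;
    return [...outputs.slice(0, at), ...synthetic.toSorted(byDecision), ...outputs.slice(at)];
  }

  // Real + synthetic audio: sort only this slice, leaving everything else
  // untouched. Assumes audio outputs are contiguous under CAPABILITY_ORDER.
  const lastAudio = outputs.findLastIndex(isAudio);
  const audio = [...outputs.filter(isAudio), ...synthetic].toSorted(byDecision);
  return [
    ...outputs.slice(0, firstAudio),
    ...audio,
    ...outputs.slice(lastAudio + 1),
  ];
}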
117 lines
3.0 KiB
TypeScript
import { describe, expect, it } from "vitest";

import { formatMediaUnderstandingBody } from "./format.js";

describe("formatMediaUnderstandingBody", () => {
  it("replaces placeholder body with transcript", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio>",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "hello world",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nTranscript:\nhello world");
  });

  it("includes user text when body is meaningful", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });

  it("strips leading media placeholders from user text", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio> caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });

  it("keeps user text once when multiple outputs exist", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "audio text",
          provider: "groq",
        },
        {
          kind: "video.description",
          attachmentIndex: 1,
          text: "video text",
          provider: "google",
        },
      ],
    });
    expect(body).toBe(
      [
        "User text:\ncaption here",
        "[Audio]\nTranscript:\naudio text",
        "[Video]\nDescription:\nvideo text",
      ].join("\n\n"),
    );
  });

  it("formats image outputs", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:image>",
      outputs: [
        {
          kind: "image.description",
          attachmentIndex: 0,
          text: "a cat",
          provider: "openai",
        },
      ],
    });
    expect(body).toBe("[Image]\nDescription:\na cat");
  });

  it("labels audio transcripts by their attachment order", () => {
    const body = formatMediaUnderstandingBody({
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "first clip was silent",
          provider: "openclaw",
        },
        {
          kind: "audio.transcription",
          attachmentIndex: 1,
          text: "second clip has speech",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe(
      [
        "[Audio 1/2]\nTranscript:\nfirst clip was silent",
        "[Audio 2/2]\nTranscript:\nsecond clip has speech",
      ].join("\n\n"),
    );
  });
});
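To make the expected strings above easier to follow, here is a minimal sketch of a formatter that would satisfy these tests. It is reverse-engineered from the expectations only, not the actual ./format.js: the kind union, the placeholder-stripping regex, and the rule for where user text nests are all assumptions.

type MediaOutput = {
  kind: "audio.transcription" | "video.description" | "image.description";
  attachmentIndex: number;
  text: string;
  provider: string;
};

const LABELS: Record<MediaOutput["kind"], { media: string; field: string }> = {
  "audio.transcription": { media: "Audio", field: "Transcript" },
  "video.description": { media: "Video", field: "Description" },
  "image.description": { media: "Image", field: "Description" },
};

export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaOutput[];
}): string {
  const { outputs } = params;
  // Strip leading <media:...> placeholders; whatever remains is user text.
  const userText = (params.body ?? "").replace(/^(?:<media:[a-z]+>\s*)+/i, "").trim();

  // Count outputs per media type so repeated types get "[Audio i/n]" labels.
  const totals = new Map<string, number>();
  for (const o of outputs) {
    const { media } = LABELS[o.kind];
    totals.set(media, (totals.get(media) ?? 0) + 1);
  }
  const seen = new Map<string, number>();

  const blocks = outputs.map((o) => {
    const { media, field } = LABELS[o.kind];
    const n = totals.get(media) ?? 1;
    const i = (seen.get(media) ?? 0) + 1;
    seen.set(media, i);
    const label = n > 1 ? `[${media} ${i}/${n}]` : `[${media}]`;
    // With a single output, user text nests inside that output's block.
    const user = userText && outputs.length === 1 ? `User text:\n${userText}\n` : "";
    return `${label}\n${user}${field}:\n${o.text}`;
  });

  // With multiple outputs, user text becomes its own leading block instead.
  if (userText && outputs.length > 1) blocks.unshift(`User text:\n${userText}`);
  return blocks.join("\n\n");
}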