mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-18 16:54:46 +00:00
fix(webchat): render tts audio command replies
This commit is contained in:
committed by
Peter Steinberger
parent
686b93e5c7
commit
817dca5ae9
@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Cron/Codex: default exact-command scheduled agent turns to lightweight bootstrap context so automation runs the command before loading workspace identity or memory context.
|
||||
- Codex plugin/Gateway: strip unpaired UTF-16 surrogates from Codex app-server JSON-RPC payloads and let stale reply-work recovery abort stalled reply runs, preventing malformed media turns from wedging gateway lanes.
|
||||
- Codex app server: force OAuth refresh requests to perform a real token refresh instead of reusing unchanged inherited auth-profile tokens after refresh failures. (#80738) Thanks @simplyclever914.
|
||||
- Control UI/WebChat: render `/tts audio` replies as playable audio attachments through the assistant-media ticket path, with structured-audio compatibility for older live payloads. (#81722) Thanks @Conan-Scott.
|
||||
- Bind gateway approval access to requester metadata [AI]. (#81380) Thanks @pgondhi987.
|
||||
- Telegram: let isolated polling drain independent topics, DMs, and status/control commands concurrently while preserving same-lane order. (#81849) Thanks @VACInc.
|
||||
- Doctor/Codex: stop warning that the message tool is unavailable for source-reply paths where OpenClaw grants `message` at runtime, keeping update and doctor output aligned with the OpenAI happy path. Thanks @pashpashpash.
|
||||
|
||||
@@ -38,6 +38,12 @@ describe("buildControlUiCspHeader", () => {
|
||||
expect(csp).not.toContain("img-src 'self' data: blob: https:");
|
||||
});
|
||||
|
||||
// Checks that the generated CSP header carries the media-src directive for
// 'self', data: and blob: sources, and that the directive is not followed by https:.
it("allows same-origin and inline audio/video playback", () => {
  const header = buildControlUiCspHeader();
  const mediaDirective = "media-src 'self' data: blob:";

  expect(header).toContain(mediaDirective);
  expect(header).not.toContain(`${mediaDirective} https:`);
});
|
||||
|
||||
it("includes inline script hashes in script-src when provided", () => {
|
||||
const csp = buildControlUiCspHeader({
|
||||
inlineScriptHashes: ["sha256-abc123"],
|
||||
|
||||
@@ -45,6 +45,7 @@ export function buildControlUiCspHeader(opts?: { inlineScriptHashes?: string[] }
|
||||
scriptSrc,
|
||||
"style-src 'self' 'unsafe-inline' https://fonts.googleapis.com",
|
||||
"img-src 'self' data: blob:",
|
||||
"media-src 'self' data: blob:",
|
||||
"font-src 'self' https://fonts.gstatic.com",
|
||||
"worker-src 'self'",
|
||||
"connect-src 'self' ws: wss: https://api.openai.com https://tweakcn.com",
|
||||
|
||||
@@ -20,7 +20,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
|
||||
tmpDir = undefined;
|
||||
});
|
||||
|
||||
it("embeds a local audio file as a base64 gateway chat block when it is under localRoots", async () => {
|
||||
it("exposes a local audio file as a media-ticketed attachment when it is under localRoots", async () => {
|
||||
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
|
||||
const audioPath = path.join(tmpDir, "clip.mp3");
|
||||
fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
|
||||
@@ -33,15 +33,34 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
|
||||
expect(blocks).toHaveLength(1);
|
||||
const block = blocks[0] as {
|
||||
type?: string;
|
||||
source?: { type?: string; media_type?: string; data?: string };
|
||||
attachment?: { url?: string; kind?: string; label?: string; mimeType?: string };
|
||||
};
|
||||
expect(block.type).toBe("audio");
|
||||
expect(block.source?.type).toBe("base64");
|
||||
expect(block.source?.media_type).toBe("audio/mpeg");
|
||||
expect(block.source?.data?.includes("data:")).toBe(false);
|
||||
expect(Buffer.from(block.source?.data ?? "", "base64")).toEqual(
|
||||
Buffer.from([0xff, 0xfb, 0x90, 0x00]),
|
||||
expect(block.type).toBe("attachment");
|
||||
expect(block.attachment).toEqual({
|
||||
url: fs.realpathSync(audioPath),
|
||||
kind: "audio",
|
||||
label: "clip.mp3",
|
||||
mimeType: "audio/mpeg",
|
||||
});
|
||||
});
|
||||
|
||||
// A payload flagged with `audioAsVoice: true` should yield a single attachment
// block whose attachment carries `isVoiceNote: true`.
it("preserves voice-note metadata on local audio attachments", async () => {
  const scratchDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
  // Publish the directory through the shared `tmpDir` variable before writing anything into it.
  tmpDir = scratchDir;
  const clipPath = path.join(scratchDir, "clip.mp3");
  const clipBytes = Buffer.from([0xff, 0xfb, 0x90, 0x00]);
  fs.writeFileSync(clipPath, clipBytes);

  const produced = await buildWebchatAudioContentBlocksFromReplyPayloads(
    [{ mediaUrl: clipPath, trustedLocalMedia: true, audioAsVoice: true }],
    { localRoots: [scratchDir] },
  );

  expect(produced).toHaveLength(1);
  const [firstBlock] = produced;
  expect(firstBlock).toMatchObject({
    type: "attachment",
    attachment: { isVoiceNote: true },
  });
});
|
||||
|
||||
it("suppresses reasoning payload audio", async () => {
|
||||
@@ -113,7 +132,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
|
||||
);
|
||||
|
||||
expect(blocks).toHaveLength(1);
|
||||
expect((blocks[0] as { type?: string }).type).toBe("audio");
|
||||
expect((blocks[0] as { type?: string }).type).toBe("attachment");
|
||||
});
|
||||
|
||||
it("drops tool-result file:// URLs with remote hosts before touching the filesystem", async () => {
|
||||
@@ -171,7 +190,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
|
||||
]);
|
||||
|
||||
expect(blocks).toHaveLength(1);
|
||||
expect((blocks[0] as { type?: string }).type).toBe("audio");
|
||||
expect((blocks[0] as { type?: string }).type).toBe("attachment");
|
||||
});
|
||||
|
||||
it("skips local audio when the opened file stat is over the cap", async () => {
|
||||
|
||||
@@ -9,7 +9,7 @@ import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
|
||||
import { sanitizeReplyDirectiveId } from "../../utils/directive-tags.js";
|
||||
import { isSuppressedControlReplyText } from "../control-reply-text.js";
|
||||
|
||||
/** Cap embedded audio size to avoid multi‑MB payloads on the chat WebSocket. */
|
||||
/** Cap local audio files exposed through assistant media. */
|
||||
const MAX_WEBCHAT_AUDIO_BYTES = 15 * 1024 * 1024;
|
||||
const MAX_WEBCHAT_IMAGE_DATA_URL_CHARS = 2_000_000;
|
||||
const MAX_WEBCHAT_IMAGE_DATA_BYTES = 1_500_000;
|
||||
@@ -103,18 +103,16 @@ async function readLocalAudioContentBlockForEmbedding(
|
||||
if (opened.stat.size > MAX_WEBCHAT_AUDIO_BYTES) {
|
||||
return null;
|
||||
}
|
||||
const buf = await opened.handle.readFile();
|
||||
if (buf.length > MAX_WEBCHAT_AUDIO_BYTES) {
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
path: opened.realPath,
|
||||
block: {
|
||||
type: "audio",
|
||||
source: {
|
||||
type: "base64",
|
||||
media_type: mimeTypeForPath(opened.realPath),
|
||||
data: buf.toString("base64"),
|
||||
type: "attachment",
|
||||
attachment: {
|
||||
url: opened.realPath,
|
||||
kind: "audio",
|
||||
label: path.basename(opened.realPath),
|
||||
mimeType: mimeTypeForPath(opened.realPath),
|
||||
...(payload.audioAsVoice === true ? { isVoiceNote: true } : {}),
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
@@ -756,7 +756,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
});
|
||||
|
||||
await waitForAssertion(() => {
|
||||
const assistantUpdate = findAssistantUpdateWithBlock((block) => block.type === "audio");
|
||||
const assistantUpdate = findAssistantUpdateWithBlock((block) => block.type === "attachment");
|
||||
const message = assistantUpdate?.message as Record<string, any> | undefined;
|
||||
const content = Array.isArray(message?.content)
|
||||
? (message.content as Array<Record<string, any>>)
|
||||
@@ -764,9 +764,15 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
expect(message?.role).toBe("assistant");
|
||||
expect(message?.idempotencyKey).toBe("idem-agent-audio:assistant-media");
|
||||
expect(content[0]).toEqual({ type: "text", text: "Audio reply" });
|
||||
expect(content[1]?.type).toBe("audio");
|
||||
expect(content[1]?.source?.type).toBe("base64");
|
||||
expect(content[1]?.source?.media_type).toBe("audio/mpeg");
|
||||
expect(content[1]).toEqual({
|
||||
type: "attachment",
|
||||
attachment: {
|
||||
url: fs.realpathSync(audioPath),
|
||||
kind: "audio",
|
||||
label: "reply.mp3",
|
||||
mimeType: "audio/mpeg",
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -820,9 +826,16 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
expect(message?.role).toBe("assistant");
|
||||
expect(message?.idempotencyKey).toBe("idem-agent-tts:assistant-media");
|
||||
expect(content[0]).toEqual({ type: "text", text: "Audio reply" });
|
||||
expect(content[1]?.type).toBe("audio");
|
||||
expect(content[1]?.source?.type).toBe("base64");
|
||||
expect(content[1]?.source?.media_type).toBe("audio/mpeg");
|
||||
expect(content[1]).toEqual({
|
||||
type: "attachment",
|
||||
attachment: {
|
||||
url: fs.realpathSync(audioPath),
|
||||
kind: "audio",
|
||||
label: "tts.mp3",
|
||||
mimeType: "audio/mpeg",
|
||||
isVoiceNote: true,
|
||||
},
|
||||
});
|
||||
expect(JSON.stringify(assistantUpdates[0]?.message)).not.toContain(
|
||||
"This text is already in the model transcript.",
|
||||
);
|
||||
@@ -957,9 +970,16 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
|
||||
const content = getMessageContent(payload);
|
||||
expect(getMessage(payload)?.role).toBe("assistant");
|
||||
expect(content[0]).toEqual({ type: "text", text: "Command result with TTS." });
|
||||
expect(content[1]?.type).toBe("audio");
|
||||
expect(content[1]?.source?.type).toBe("base64");
|
||||
expect(content[1]?.source?.media_type).toBe("audio/mpeg");
|
||||
expect(content[1]).toEqual({
|
||||
type: "attachment",
|
||||
attachment: {
|
||||
url: fs.realpathSync(audioPath),
|
||||
kind: "audio",
|
||||
label: "tts.mp3",
|
||||
mimeType: "audio/mpeg",
|
||||
isVoiceNote: true,
|
||||
},
|
||||
});
|
||||
const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
|
||||
(update) =>
|
||||
typeof update.message === "object" &&
|
||||
|
||||
@@ -89,6 +89,83 @@ describe("message-normalizer", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// An assistant `audio` block with a base64 source should come back as an
// audio attachment whose URL is a data: URL built from the media type and payload.
it("normalizes structured base64 audio content blocks as renderable attachments", () => {
  const audioBlock = {
    type: "audio",
    label: "tts.mp3",
    source: { type: "base64", media_type: "audio/mpeg", data: "//uQAA==" },
  };
  const expectedAttachment = {
    type: "attachment",
    attachment: {
      url: "data:audio/mpeg;base64,//uQAA==",
      kind: "audio",
      label: "tts.mp3",
      mimeType: "audio/mpeg",
    },
  };

  const normalized = normalizeMessage({ role: "assistant", content: [audioBlock] });

  expect(normalized.content).toEqual([expectedAttachment]);
});
|
||||
|
||||
// An assistant `audio` block with a url source should come back as an audio
// attachment that keeps the source URL unchanged.
it("normalizes structured URL audio content blocks as renderable attachments", () => {
  const clipUrl = "/tmp/openclaw/clip.mp3";
  const audioBlock = {
    type: "audio",
    label: "clip.mp3",
    source: { type: "url", media_type: "audio/mpeg", url: clipUrl },
  };

  const normalized = normalizeMessage({ role: "assistant", content: [audioBlock] });

  expect(normalized.content).toEqual([
    {
      type: "attachment",
      attachment: {
        url: clipUrl,
        kind: "audio",
        label: "clip.mp3",
        mimeType: "audio/mpeg",
      },
    },
  ]);
});
|
||||
|
||||
// The same structured audio block on a `user` message is expected to produce
// no content items at all.
it("does not normalize non-assistant structured audio blocks as attachments", () => {
  const uploadedAudio = {
    type: "audio",
    label: "upload.mp3",
    source: { type: "base64", media_type: "audio/mpeg", data: "//uQAA==" },
  };

  const normalized = normalizeMessage({ role: "user", content: [uploadedAudio] });

  expect(normalized.content).toEqual([]);
});
|
||||
|
||||
it("does not reinterpret directive-like user text blocks inside array content", () => {
|
||||
const result = normalizeMessage({
|
||||
role: "user",
|
||||
|
||||
@@ -145,6 +145,58 @@ function inferAttachmentKind(url: string): {
|
||||
return { kind, mimeType, label };
|
||||
}
|
||||
|
||||
/**
 * Converts a structured `audio` content block into an audio attachment item.
 *
 * Two source shapes are recognized:
 * - `{ type: "base64", data }` — `data` is wrapped into a `data:` URL using the
 *   resolved media type, unless it already is a `data:` URL, in which case it
 *   is passed through untouched.
 * - `{ type: "url", url }` — the trimmed URL is used as-is.
 *
 * The media type is taken from `source.media_type` when it is an `audio/*`
 * string and falls back to `audio/mpeg` otherwise. The label falls back to
 * `"Audio"` when `item.label` is missing or blank.
 *
 * @param item - A raw content block taken from a message's `content` array.
 * @returns The attachment item, or `null` when `item` is not an `audio` block,
 *   its `source` is not a plain object, the source type is not recognized, or
 *   the payload/URL is empty after trimming.
 */
function coerceAudioContentBlock(
  item: Record<string, unknown>,
): Extract<MessageContentItem, { type: "attachment" }> | null {
  if (item.type !== "audio") {
    return null;
  }
  const source = item.source;
  if (!source || typeof source !== "object" || Array.isArray(source)) {
    return null;
  }
  const sourceRecord = source as Record<string, unknown>;

  const declaredMediaType =
    typeof sourceRecord.media_type === "string" ? sourceRecord.media_type.trim() : "";
  const mediaType = declaredMediaType.toLowerCase().startsWith("audio/")
    ? declaredMediaType
    : "audio/mpeg";

  let url: string;
  if (sourceRecord.type === "base64" && typeof sourceRecord.data === "string") {
    const data = sourceRecord.data.trim();
    if (!data) {
      return null;
    }
    // URL schemes are case-insensitive, so `DATA:...` is already a complete
    // data URL and must not be wrapped a second time.
    url = /^data:/i.test(data) ? data : `data:${mediaType};base64,${data}`;
  } else if (sourceRecord.type === "url" && typeof sourceRecord.url === "string") {
    url = sourceRecord.url.trim();
    if (!url) {
      return null;
    }
  } else {
    return null;
  }

  const trimmedLabel = typeof item.label === "string" ? item.label.trim() : "";
  return {
    type: "attachment",
    attachment: {
      url,
      kind: "audio",
      label: trimmedLabel || "Audio",
      mimeType: mediaType,
      // Only emit the flag when it is explicitly `true`, so the key stays absent otherwise.
      ...(item.isVoiceNote === true ? { isVoiceNote: true } : {}),
    },
  };
}
|
||||
|
||||
function mergeAdjacentTextItems(items: MessageContentItem[]): MessageContentItem[] {
|
||||
const merged: MessageContentItem[] = [];
|
||||
for (const item of items) {
|
||||
@@ -292,6 +344,14 @@ export function normalizeMessage(message: unknown): NormalizedMessage {
|
||||
}
|
||||
} else if (Array.isArray(m.content)) {
|
||||
content = m.content.flatMap((item: Record<string, unknown>) => {
|
||||
if (isAssistantMessage) {
|
||||
const audioAttachment = coerceAudioContentBlock(item);
|
||||
if (audioAttachment) {
|
||||
return [audioAttachment];
|
||||
}
|
||||
} else if (item.type === "audio") {
|
||||
return [];
|
||||
}
|
||||
if (
|
||||
item.type === "attachment" &&
|
||||
item.attachment &&
|
||||
|
||||
Reference in New Issue
Block a user