fix(webchat): render tts audio command replies

This commit is contained in:
Conan Scott
2026-05-14 19:28:42 +10:00
committed by Peter Steinberger
parent 686b93e5c7
commit 817dca5ae9
8 changed files with 212 additions and 30 deletions

View File

@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
- Cron/Codex: default exact-command scheduled agent turns to lightweight bootstrap context so automation runs the command before loading workspace identity or memory context.
- Codex plugin/Gateway: strip unpaired UTF-16 surrogates from Codex app-server JSON-RPC payloads and let stale reply-work recovery abort stalled reply runs, preventing malformed media turns from wedging gateway lanes.
- Codex app server: force OAuth refresh requests to perform a real token refresh instead of reusing unchanged inherited auth-profile tokens after refresh failures. (#80738) Thanks @simplyclever914.
- Control UI/WebChat: render `/tts audio` replies as playable audio attachments through the assistant-media ticket path, with structured-audio compatibility for older live payloads. (#81722) Thanks @Conan-Scott.
- Bind gateway approval access to requester metadata [AI]. (#81380) Thanks @pgondhi987.
- Telegram: let isolated polling drain independent topics, DMs, and status/control commands concurrently while preserving same-lane order. (#81849) Thanks @VACInc.
- Doctor/Codex: stop warning that the message tool is unavailable for source-reply paths where OpenClaw grants `message` at runtime, keeping update and doctor output aligned with the OpenAI happy path. Thanks @pashpashpash.

View File

@@ -38,6 +38,12 @@ describe("buildControlUiCspHeader", () => {
expect(csp).not.toContain("img-src 'self' data: blob: https:");
});
// The media-src directive must cover 'self', data: and blob: sources, and the
// header must not contain that directive widened with a trailing https: source.
it("allows same-origin and inline audio/video playback", () => {
  const mediaDirective = "media-src 'self' data: blob:";
  const header = buildControlUiCspHeader();
  expect(header).toContain(mediaDirective);
  expect(header).not.toContain(`${mediaDirective} https:`);
});
it("includes inline script hashes in script-src when provided", () => {
const csp = buildControlUiCspHeader({
inlineScriptHashes: ["sha256-abc123"],

View File

@@ -45,6 +45,7 @@ export function buildControlUiCspHeader(opts?: { inlineScriptHashes?: string[] }
scriptSrc,
"style-src 'self' 'unsafe-inline' https://fonts.googleapis.com",
"img-src 'self' data: blob:",
"media-src 'self' data: blob:",
"font-src 'self' https://fonts.gstatic.com",
"worker-src 'self'",
"connect-src 'self' ws: wss: https://api.openai.com https://tweakcn.com",

View File

@@ -20,7 +20,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
tmpDir = undefined;
});
it("embeds a local audio file as a base64 gateway chat block when it is under localRoots", async () => {
it("exposes a local audio file as a media-ticketed attachment when it is under localRoots", async () => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
const audioPath = path.join(tmpDir, "clip.mp3");
fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
@@ -33,15 +33,34 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
expect(blocks).toHaveLength(1);
const block = blocks[0] as {
type?: string;
source?: { type?: string; media_type?: string; data?: string };
attachment?: { url?: string; kind?: string; label?: string; mimeType?: string };
};
expect(block.type).toBe("audio");
expect(block.source?.type).toBe("base64");
expect(block.source?.media_type).toBe("audio/mpeg");
expect(block.source?.data?.includes("data:")).toBe(false);
expect(Buffer.from(block.source?.data ?? "", "base64")).toEqual(
Buffer.from([0xff, 0xfb, 0x90, 0x00]),
expect(block.type).toBe("attachment");
expect(block.attachment).toEqual({
url: fs.realpathSync(audioPath),
kind: "audio",
label: "clip.mp3",
mimeType: "audio/mpeg",
});
});
it("preserves voice-note metadata on local audio attachments", async () => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
const audioPath = path.join(tmpDir, "clip.mp3");
fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: audioPath, trustedLocalMedia: true, audioAsVoice: true }],
{ localRoots: [tmpDir] },
);
expect(blocks).toHaveLength(1);
expect(blocks[0]).toMatchObject({
type: "attachment",
attachment: {
isVoiceNote: true,
},
});
});
it("suppresses reasoning payload audio", async () => {
@@ -113,7 +132,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
);
expect(blocks).toHaveLength(1);
expect((blocks[0] as { type?: string }).type).toBe("audio");
expect((blocks[0] as { type?: string }).type).toBe("attachment");
});
it("drops tool-result file:// URLs with remote hosts before touching the filesystem", async () => {
@@ -171,7 +190,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
]);
expect(blocks).toHaveLength(1);
expect((blocks[0] as { type?: string }).type).toBe("audio");
expect((blocks[0] as { type?: string }).type).toBe("attachment");
});
it("skips local audio when the opened file stat is over the cap", async () => {

View File

@@ -9,7 +9,7 @@ import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
import { sanitizeReplyDirectiveId } from "../../utils/directive-tags.js";
import { isSuppressedControlReplyText } from "../control-reply-text.js";
/** Cap embedded audio size to avoid multiMB payloads on the chat WebSocket. */
/** Cap local audio files exposed through assistant media. */
const MAX_WEBCHAT_AUDIO_BYTES = 15 * 1024 * 1024;
const MAX_WEBCHAT_IMAGE_DATA_URL_CHARS = 2_000_000;
const MAX_WEBCHAT_IMAGE_DATA_BYTES = 1_500_000;
@@ -103,18 +103,16 @@ async function readLocalAudioContentBlockForEmbedding(
if (opened.stat.size > MAX_WEBCHAT_AUDIO_BYTES) {
return null;
}
const buf = await opened.handle.readFile();
if (buf.length > MAX_WEBCHAT_AUDIO_BYTES) {
return null;
}
return {
path: opened.realPath,
block: {
type: "audio",
source: {
type: "base64",
media_type: mimeTypeForPath(opened.realPath),
data: buf.toString("base64"),
type: "attachment",
attachment: {
url: opened.realPath,
kind: "audio",
label: path.basename(opened.realPath),
mimeType: mimeTypeForPath(opened.realPath),
...(payload.audioAsVoice === true ? { isVoiceNote: true } : {}),
},
},
};

View File

@@ -756,7 +756,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
});
await waitForAssertion(() => {
const assistantUpdate = findAssistantUpdateWithBlock((block) => block.type === "audio");
const assistantUpdate = findAssistantUpdateWithBlock((block) => block.type === "attachment");
const message = assistantUpdate?.message as Record<string, any> | undefined;
const content = Array.isArray(message?.content)
? (message.content as Array<Record<string, any>>)
@@ -764,9 +764,15 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expect(message?.role).toBe("assistant");
expect(message?.idempotencyKey).toBe("idem-agent-audio:assistant-media");
expect(content[0]).toEqual({ type: "text", text: "Audio reply" });
expect(content[1]?.type).toBe("audio");
expect(content[1]?.source?.type).toBe("base64");
expect(content[1]?.source?.media_type).toBe("audio/mpeg");
expect(content[1]).toEqual({
type: "attachment",
attachment: {
url: fs.realpathSync(audioPath),
kind: "audio",
label: "reply.mp3",
mimeType: "audio/mpeg",
},
});
});
});
@@ -820,9 +826,16 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
expect(message?.role).toBe("assistant");
expect(message?.idempotencyKey).toBe("idem-agent-tts:assistant-media");
expect(content[0]).toEqual({ type: "text", text: "Audio reply" });
expect(content[1]?.type).toBe("audio");
expect(content[1]?.source?.type).toBe("base64");
expect(content[1]?.source?.media_type).toBe("audio/mpeg");
expect(content[1]).toEqual({
type: "attachment",
attachment: {
url: fs.realpathSync(audioPath),
kind: "audio",
label: "tts.mp3",
mimeType: "audio/mpeg",
isVoiceNote: true,
},
});
expect(JSON.stringify(assistantUpdates[0]?.message)).not.toContain(
"This text is already in the model transcript.",
);
@@ -957,9 +970,16 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
const content = getMessageContent(payload);
expect(getMessage(payload)?.role).toBe("assistant");
expect(content[0]).toEqual({ type: "text", text: "Command result with TTS." });
expect(content[1]?.type).toBe("audio");
expect(content[1]?.source?.type).toBe("base64");
expect(content[1]?.source?.media_type).toBe("audio/mpeg");
expect(content[1]).toEqual({
type: "attachment",
attachment: {
url: fs.realpathSync(audioPath),
kind: "audio",
label: "tts.mp3",
mimeType: "audio/mpeg",
isVoiceNote: true,
},
});
const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
(update) =>
typeof update.message === "object" &&

View File

@@ -89,6 +89,83 @@ describe("message-normalizer", () => {
});
});
// An assistant `audio` block with an inline base64 source should come out of
// normalizeMessage as an audio attachment whose URL is a data: URL.
it("normalizes structured base64 audio content blocks as renderable attachments", () => {
  const base64Audio = "//uQAA==";
  const assistantMessage = {
    role: "assistant",
    content: [
      {
        type: "audio",
        label: "tts.mp3",
        source: { type: "base64", media_type: "audio/mpeg", data: base64Audio },
      },
    ],
  };
  const expectedContent = [
    {
      type: "attachment",
      attachment: {
        url: `data:audio/mpeg;base64,${base64Audio}`,
        kind: "audio",
        label: "tts.mp3",
        mimeType: "audio/mpeg",
      },
    },
  ];

  const normalized = normalizeMessage(assistantMessage);

  expect(normalized.content).toEqual(expectedContent);
});
// An assistant `audio` block with a URL source should keep that URL verbatim
// on the resulting audio attachment.
it("normalizes structured URL audio content blocks as renderable attachments", () => {
  const clipUrl = "/tmp/openclaw/clip.mp3";
  const assistantMessage = {
    role: "assistant",
    content: [
      {
        type: "audio",
        label: "clip.mp3",
        source: { type: "url", media_type: "audio/mpeg", url: clipUrl },
      },
    ],
  };
  const expectedContent = [
    {
      type: "attachment",
      attachment: {
        url: clipUrl,
        kind: "audio",
        label: "clip.mp3",
        mimeType: "audio/mpeg",
      },
    },
  ];

  const normalized = normalizeMessage(assistantMessage);

  expect(normalized.content).toEqual(expectedContent);
});
// Structured `audio` blocks on a user message are dropped entirely rather than
// being turned into attachments.
it("does not normalize non-assistant structured audio blocks as attachments", () => {
  const userMessage = {
    role: "user",
    content: [
      {
        type: "audio",
        label: "upload.mp3",
        source: { type: "base64", media_type: "audio/mpeg", data: "//uQAA==" },
      },
    ],
  };

  const normalized = normalizeMessage(userMessage);

  expect(normalized.content).toEqual([]);
});
it("does not reinterpret directive-like user text blocks inside array content", () => {
const result = normalizeMessage({
role: "user",

View File

@@ -145,6 +145,58 @@ function inferAttachmentKind(url: string): {
return { kind, mimeType, label };
}
/**
 * Converts a structured `audio` content block into an audio attachment item.
 *
 * Two source shapes are recognized:
 * - `{ type: "base64", data }` is wrapped into a `data:` URL, or passed through
 *   unchanged when `data` already is a `data:` URL.
 * - `{ type: "url", url }` has its trimmed URL passed through unchanged.
 *
 * @param item - Raw content block taken from a message's `content` array.
 * @returns The attachment item, or `null` when the block is not an `audio`
 *   block, has no object `source`, or carries an empty or unsupported payload.
 */
function coerceAudioContentBlock(
  item: Record<string, unknown>,
): Extract<MessageContentItem, { type: "attachment" }> | null {
  if (item.type !== "audio") {
    return null;
  }
  const source = item.source;
  if (!source || typeof source !== "object" || Array.isArray(source)) {
    return null;
  }
  const sourceRecord = source as Record<string, unknown>;

  // Only audio/* media types are honored; anything else falls back to MP3.
  const declaredType =
    typeof sourceRecord.media_type === "string" ? sourceRecord.media_type.trim() : "";
  const mediaType = declaredType.toLowerCase().startsWith("audio/")
    ? declaredType
    : "audio/mpeg";

  let url = "";
  if (sourceRecord.type === "base64" && typeof sourceRecord.data === "string") {
    const data = sourceRecord.data.trim();
    if (data) {
      // URI schemes are case-insensitive, so `DATA:` must also be treated as an
      // already-formed data URL rather than being wrapped a second time.
      const isDataUrl = data.slice(0, 5).toLowerCase() === "data:";
      url = isDataUrl ? data : `data:${mediaType};base64,${data}`;
    }
  } else if (sourceRecord.type === "url" && typeof sourceRecord.url === "string") {
    url = sourceRecord.url.trim();
  }
  if (!url) {
    return null;
  }

  const trimmedLabel = typeof item.label === "string" ? item.label.trim() : "";
  return {
    type: "attachment",
    attachment: {
      url,
      kind: "audio",
      label: trimmedLabel || "Audio",
      mimeType: mediaType,
      // The flag is only emitted when explicitly true so the shape stays minimal.
      ...(item.isVoiceNote === true ? { isVoiceNote: true } : {}),
    },
  };
}
function mergeAdjacentTextItems(items: MessageContentItem[]): MessageContentItem[] {
const merged: MessageContentItem[] = [];
for (const item of items) {
@@ -292,6 +344,14 @@ export function normalizeMessage(message: unknown): NormalizedMessage {
}
} else if (Array.isArray(m.content)) {
content = m.content.flatMap((item: Record<string, unknown>) => {
if (isAssistantMessage) {
const audioAttachment = coerceAudioContentBlock(item);
if (audioAttachment) {
return [audioAttachment];
}
} else if (item.type === "audio") {
return [];
}
if (
item.type === "attachment" &&
item.attachment &&