diff --git a/CHANGELOG.md b/CHANGELOG.md index 35b33c6f211..6499c9994f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai - Cron/Codex: default exact-command scheduled agent turns to lightweight bootstrap context so automation runs the command before loading workspace identity or memory context. - Codex plugin/Gateway: strip unpaired UTF-16 surrogates from Codex app-server JSON-RPC payloads and let stale reply-work recovery abort stalled reply runs, preventing malformed media turns from wedging gateway lanes. - Codex app server: force OAuth refresh requests to perform a real token refresh instead of reusing unchanged inherited auth-profile tokens after refresh failures. (#80738) Thanks @simplyclever914. +- Control UI/WebChat: render `/tts audio` replies as playable audio attachments through the assistant-media ticket path, with structured-audio compatibility for older live payloads. (#81722) Thanks @Conan-Scott. - Bind gateway approval access to requester metadata [AI]. (#81380) Thanks @pgondhi987. - Telegram: let isolated polling drain independent topics, DMs, and status/control commands concurrently while preserving same-lane order. (#81849) Thanks @VACInc. - Doctor/Codex: stop warning that the message tool is unavailable for source-reply paths where OpenClaw grants `message` at runtime, keeping update and doctor output aligned with the OpenAI happy path. Thanks @pashpashpash. diff --git a/src/gateway/control-ui-csp.test.ts b/src/gateway/control-ui-csp.test.ts index c2bad96f2ee..39eb74f1fa9 100644 --- a/src/gateway/control-ui-csp.test.ts +++ b/src/gateway/control-ui-csp.test.ts @@ -38,6 +38,12 @@ describe("buildControlUiCspHeader", () => { expect(csp).not.toContain("img-src 'self' data: blob: https:"); }); + it("allows same-origin and inline audio/video playback", () => { + const csp = buildControlUiCspHeader(); + expect(csp).toContain("media-src 'self' data: blob:"); + expect(csp).not.toContain("media-src 'self' data: blob: https:"); + }); + it("includes inline script hashes in script-src when provided", () => { const csp = buildControlUiCspHeader({ inlineScriptHashes: ["sha256-abc123"], diff --git a/src/gateway/control-ui-csp.ts b/src/gateway/control-ui-csp.ts index e7fbf08425c..1b116005054 100644 --- a/src/gateway/control-ui-csp.ts +++ b/src/gateway/control-ui-csp.ts @@ -45,6 +45,7 @@ export function buildControlUiCspHeader(opts?: { inlineScriptHashes?: string[] } scriptSrc, "style-src 'self' 'unsafe-inline' https://fonts.googleapis.com", "img-src 'self' data: blob:", + "media-src 'self' data: blob:", "font-src 'self' https://fonts.gstatic.com", "worker-src 'self'", "connect-src 'self' ws: wss: https://api.openai.com https://tweakcn.com", diff --git a/src/gateway/server-methods/chat-webchat-media.test.ts b/src/gateway/server-methods/chat-webchat-media.test.ts index c83cd97bcbd..a7b97015b20 100644 --- a/src/gateway/server-methods/chat-webchat-media.test.ts +++ b/src/gateway/server-methods/chat-webchat-media.test.ts @@ -20,7 +20,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { tmpDir = undefined; }); - it("embeds a local audio file as a base64 gateway chat block when it is under localRoots", async () => { + it("exposes a local audio file as a media-ticketed attachment when it is under localRoots", async () => { tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-")); const audioPath = path.join(tmpDir, "clip.mp3"); fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00])); @@ -33,15 +33,34 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { expect(blocks).toHaveLength(1); const block = blocks[0] as { type?: string; - source?: { type?: string; media_type?: string; data?: string }; + attachment?: { url?: string; kind?: string; label?: string; mimeType?: string }; }; - expect(block.type).toBe("audio"); - expect(block.source?.type).toBe("base64"); - expect(block.source?.media_type).toBe("audio/mpeg"); - expect(block.source?.data?.includes("data:")).toBe(false); - expect(Buffer.from(block.source?.data ?? "", "base64")).toEqual( - Buffer.from([0xff, 0xfb, 0x90, 0x00]), + expect(block.type).toBe("attachment"); + expect(block.attachment).toEqual({ + url: fs.realpathSync(audioPath), + kind: "audio", + label: "clip.mp3", + mimeType: "audio/mpeg", + }); + }); + + it("preserves voice-note metadata on local audio attachments", async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-")); + const audioPath = path.join(tmpDir, "clip.mp3"); + fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00])); + + const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads( + [{ mediaUrl: audioPath, trustedLocalMedia: true, audioAsVoice: true }], + { localRoots: [tmpDir] }, ); + + expect(blocks).toHaveLength(1); + expect(blocks[0]).toMatchObject({ + type: "attachment", + attachment: { + isVoiceNote: true, + }, + }); }); it("suppresses reasoning payload audio", async () => { @@ -113,7 +132,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { ); expect(blocks).toHaveLength(1); - expect((blocks[0] as { type?: string }).type).toBe("audio"); + expect((blocks[0] as { type?: string }).type).toBe("attachment"); }); it("drops tool-result file:// URLs with remote hosts before touching the filesystem", async () => { @@ -171,7 +190,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => { ]); expect(blocks).toHaveLength(1); - expect((blocks[0] as { type?: string }).type).toBe("audio"); + expect((blocks[0] as { type?: string }).type).toBe("attachment"); }); it("skips local audio when the opened file stat is over the cap", async () => { diff --git a/src/gateway/server-methods/chat-webchat-media.ts b/src/gateway/server-methods/chat-webchat-media.ts index cb9e5c27f36..052807dabd6 100644 --- a/src/gateway/server-methods/chat-webchat-media.ts +++ b/src/gateway/server-methods/chat-webchat-media.ts @@ -9,7 +9,7 @@ import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js"; import { sanitizeReplyDirectiveId } from "../../utils/directive-tags.js"; import { isSuppressedControlReplyText } from "../control-reply-text.js"; -/** Cap embedded audio size to avoid multi‑MB payloads on the chat WebSocket. */ +/** Cap local audio files exposed through assistant media. */ const MAX_WEBCHAT_AUDIO_BYTES = 15 * 1024 * 1024; const MAX_WEBCHAT_IMAGE_DATA_URL_CHARS = 2_000_000; const MAX_WEBCHAT_IMAGE_DATA_BYTES = 1_500_000; @@ -103,18 +103,16 @@ async function readLocalAudioContentBlockForEmbedding( if (opened.stat.size > MAX_WEBCHAT_AUDIO_BYTES) { return null; } - const buf = await opened.handle.readFile(); - if (buf.length > MAX_WEBCHAT_AUDIO_BYTES) { - return null; - } return { path: opened.realPath, block: { - type: "audio", - source: { - type: "base64", - media_type: mimeTypeForPath(opened.realPath), - data: buf.toString("base64"), + type: "attachment", + attachment: { + url: opened.realPath, + kind: "audio", + label: path.basename(opened.realPath), + mimeType: mimeTypeForPath(opened.realPath), + ...(payload.audioAsVoice === true ? { isVoiceNote: true } : {}), }, }, }; diff --git a/src/gateway/server-methods/chat.directive-tags.test.ts b/src/gateway/server-methods/chat.directive-tags.test.ts index aca7afebc4f..d9a1fa11841 100644 --- a/src/gateway/server-methods/chat.directive-tags.test.ts +++ b/src/gateway/server-methods/chat.directive-tags.test.ts @@ -756,7 +756,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () => }); await waitForAssertion(() => { - const assistantUpdate = findAssistantUpdateWithBlock((block) => block.type === "audio"); + const assistantUpdate = findAssistantUpdateWithBlock((block) => block.type === "attachment"); const message = assistantUpdate?.message as Record | undefined; const content = Array.isArray(message?.content) ? (message.content as Array>) @@ -764,9 +764,15 @@ describe("chat directive tag stripping for non-streaming final payloads", () => expect(message?.role).toBe("assistant"); expect(message?.idempotencyKey).toBe("idem-agent-audio:assistant-media"); expect(content[0]).toEqual({ type: "text", text: "Audio reply" }); - expect(content[1]?.type).toBe("audio"); - expect(content[1]?.source?.type).toBe("base64"); - expect(content[1]?.source?.media_type).toBe("audio/mpeg"); + expect(content[1]).toEqual({ + type: "attachment", + attachment: { + url: fs.realpathSync(audioPath), + kind: "audio", + label: "reply.mp3", + mimeType: "audio/mpeg", + }, + }); }); }); @@ -820,9 +826,16 @@ describe("chat directive tag stripping for non-streaming final payloads", () => expect(message?.role).toBe("assistant"); expect(message?.idempotencyKey).toBe("idem-agent-tts:assistant-media"); expect(content[0]).toEqual({ type: "text", text: "Audio reply" }); - expect(content[1]?.type).toBe("audio"); - expect(content[1]?.source?.type).toBe("base64"); - expect(content[1]?.source?.media_type).toBe("audio/mpeg"); + expect(content[1]).toEqual({ + type: "attachment", + attachment: { + url: fs.realpathSync(audioPath), + kind: "audio", + label: "tts.mp3", + mimeType: "audio/mpeg", + isVoiceNote: true, + }, + }); expect(JSON.stringify(assistantUpdates[0]?.message)).not.toContain( "This text is already in the model transcript.", ); @@ -957,9 +970,16 @@ describe("chat directive tag stripping for non-streaming final payloads", () => const content = getMessageContent(payload); expect(getMessage(payload)?.role).toBe("assistant"); expect(content[0]).toEqual({ type: "text", text: "Command result with TTS." }); - expect(content[1]?.type).toBe("audio"); - expect(content[1]?.source?.type).toBe("base64"); - expect(content[1]?.source?.media_type).toBe("audio/mpeg"); + expect(content[1]).toEqual({ + type: "attachment", + attachment: { + url: fs.realpathSync(audioPath), + kind: "audio", + label: "tts.mp3", + mimeType: "audio/mpeg", + isVoiceNote: true, + }, + }); const assistantUpdates = mockState.emittedTranscriptUpdates.filter( (update) => typeof update.message === "object" && diff --git a/ui/src/ui/chat/message-normalizer.test.ts b/ui/src/ui/chat/message-normalizer.test.ts index 35fa57341ab..86633014dcd 100644 --- a/ui/src/ui/chat/message-normalizer.test.ts +++ b/ui/src/ui/chat/message-normalizer.test.ts @@ -89,6 +89,83 @@ describe("message-normalizer", () => { }); }); + it("normalizes structured base64 audio content blocks as renderable attachments", () => { + const result = normalizeMessage({ + role: "assistant", + content: [ + { + type: "audio", + label: "tts.mp3", + source: { + type: "base64", + media_type: "audio/mpeg", + data: "//uQAA==", + }, + }, + ], + }); + + expect(result.content).toEqual([ + { + type: "attachment", + attachment: { + url: "data:audio/mpeg;base64,//uQAA==", + kind: "audio", + label: "tts.mp3", + mimeType: "audio/mpeg", + }, + }, + ]); + }); + + it("normalizes structured URL audio content blocks as renderable attachments", () => { + const result = normalizeMessage({ + role: "assistant", + content: [ + { + type: "audio", + label: "clip.mp3", + source: { + type: "url", + media_type: "audio/mpeg", + url: "/tmp/openclaw/clip.mp3", + }, + }, + ], + }); + + expect(result.content).toEqual([ + { + type: "attachment", + attachment: { + url: "/tmp/openclaw/clip.mp3", + kind: "audio", + label: "clip.mp3", + mimeType: "audio/mpeg", + }, + }, + ]); + }); + + it("does not normalize non-assistant structured audio blocks as attachments", () => { + const result = normalizeMessage({ + role: "user", + content: [ + { + type: "audio", + label: "upload.mp3", + source: { + type: "base64", + media_type: "audio/mpeg", + data: "//uQAA==", + }, + }, + ], + }); + + expect(result.content).toEqual([]); + }); + it("does not reinterpret directive-like user text blocks inside array content", () => { const result = normalizeMessage({ role: "user", diff --git a/ui/src/ui/chat/message-normalizer.ts b/ui/src/ui/chat/message-normalizer.ts index 85181147753..50447e6fc0d 100644 --- a/ui/src/ui/chat/message-normalizer.ts +++ b/ui/src/ui/chat/message-normalizer.ts @@ -145,6 +145,58 @@ function inferAttachmentKind(url: string): { return { kind, mimeType, label }; } +function coerceAudioContentBlock( + item: Record, +): Extract | null { + if (item.type !== "audio") { + return null; + } + const source = item.source; + if (!source || typeof source !== "object" || Array.isArray(source)) { + return null; + } + const sourceRecord = source as Record; + const mediaType = + typeof sourceRecord.media_type === "string" && + sourceRecord.media_type.trim().toLowerCase().startsWith("audio/") + ? sourceRecord.media_type.trim() + : "audio/mpeg"; + if (sourceRecord.type === "base64" && typeof sourceRecord.data === "string") { + const data = sourceRecord.data.trim(); + if (!data) { + return null; + } + const url = data.startsWith("data:") ? data : `data:${mediaType};base64,${data}`; + return { + type: "attachment", + attachment: { + url, + kind: "audio", + label: typeof item.label === "string" && item.label.trim() ? item.label.trim() : "Audio", + mimeType: mediaType, + ...(item.isVoiceNote === true ? { isVoiceNote: true } : {}), + }, + }; + } + if (sourceRecord.type === "url" && typeof sourceRecord.url === "string") { + const url = sourceRecord.url.trim(); + if (!url) { + return null; + } + return { + type: "attachment", + attachment: { + url, + kind: "audio", + label: typeof item.label === "string" && item.label.trim() ? item.label.trim() : "Audio", + mimeType: mediaType, + ...(item.isVoiceNote === true ? { isVoiceNote: true } : {}), + }, + }; + } + return null; +} + function mergeAdjacentTextItems(items: MessageContentItem[]): MessageContentItem[] { const merged: MessageContentItem[] = []; for (const item of items) { @@ -292,6 +344,14 @@ export function normalizeMessage(message: unknown): NormalizedMessage { } } else if (Array.isArray(m.content)) { content = m.content.flatMap((item: Record) => { + if (isAssistantMessage) { + const audioAttachment = coerceAudioContentBlock(item); + if (audioAttachment) { + return [audioAttachment]; + } + } else if (item.type === "audio") { + return []; + } if ( item.type === "attachment" && item.attachment &&