fix(webchat): render tts audio command replies

2026-05-18 16:54:46 +00:00 · 2026-05-14 19:28:42 +10:00
parent 686b93e5c7
commit 817dca5ae9
8 changed files with 212 additions and 30 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
 - Cron/Codex: default exact-command scheduled agent turns to lightweight bootstrap context so automation runs the command before loading workspace identity or memory context.
 - Codex plugin/Gateway: strip unpaired UTF-16 surrogates from Codex app-server JSON-RPC payloads and let stale reply-work recovery abort stalled reply runs, preventing malformed media turns from wedging gateway lanes.
 - Codex app server: force OAuth refresh requests to perform a real token refresh instead of reusing unchanged inherited auth-profile tokens after refresh failures. (#80738) Thanks @simplyclever914.
+- Control UI/WebChat: render `/tts audio` replies as playable audio attachments through the assistant-media ticket path, with structured-audio compatibility for older live payloads. (#81722) Thanks @Conan-Scott.
 - Bind gateway approval access to requester metadata [AI]. (#81380) Thanks @pgondhi987.
 - Telegram: let isolated polling drain independent topics, DMs, and status/control commands concurrently while preserving same-lane order. (#81849) Thanks @VACInc.
 - Doctor/Codex: stop warning that the message tool is unavailable for source-reply paths where OpenClaw grants `message` at runtime, keeping update and doctor output aligned with the OpenAI happy path. Thanks @pashpashpash.
--- a/src/gateway/control-ui-csp.test.ts
+++ b/src/gateway/control-ui-csp.test.ts
@@ -38,6 +38,12 @@ describe("buildControlUiCspHeader", () => {
    expect(csp).not.toContain("img-src 'self' data: blob: https:");
  });

+  it("allows same-origin and inline audio/video playback", () => {
+    const csp = buildControlUiCspHeader();
+    expect(csp).toContain("media-src 'self' data: blob:");
+    expect(csp).not.toContain("media-src 'self' data: blob: https:");
+  });
+
  it("includes inline script hashes in script-src when provided", () => {
    const csp = buildControlUiCspHeader({
      inlineScriptHashes: ["sha256-abc123"],
--- a/src/gateway/control-ui-csp.ts
+++ b/src/gateway/control-ui-csp.ts
@@ -45,6 +45,7 @@ export function buildControlUiCspHeader(opts?: { inlineScriptHashes?: string[] }
    scriptSrc,
    "style-src 'self' 'unsafe-inline' https://fonts.googleapis.com",
    "img-src 'self' data: blob:",
+    "media-src 'self' data: blob:",
    "font-src 'self' https://fonts.gstatic.com",
    "worker-src 'self'",
    "connect-src 'self' ws: wss: https://api.openai.com https://tweakcn.com",
--- a/src/gateway/server-methods/chat-webchat-media.test.ts
+++ b/src/gateway/server-methods/chat-webchat-media.test.ts
@@ -20,7 +20,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
    tmpDir = undefined;
  });

-  it("embeds a local audio file as a base64 gateway chat block when it is under localRoots", async () => {
+  it("exposes a local audio file as a media-ticketed attachment when it is under localRoots", async () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "clip.mp3");
    fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
@@ -33,15 +33,34 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
    expect(blocks).toHaveLength(1);
    const block = blocks[0] as {
      type?: string;
-      source?: { type?: string; media_type?: string; data?: string };
+      attachment?: { url?: string; kind?: string; label?: string; mimeType?: string };
    };
-    expect(block.type).toBe("audio");
-    expect(block.source?.type).toBe("base64");
-    expect(block.source?.media_type).toBe("audio/mpeg");
-    expect(block.source?.data?.includes("data:")).toBe(false);
-    expect(Buffer.from(block.source?.data ?? "", "base64")).toEqual(
-      Buffer.from([0xff, 0xfb, 0x90, 0x00]),
+    expect(block.type).toBe("attachment");
+    expect(block.attachment).toEqual({
+      url: fs.realpathSync(audioPath),
+      kind: "audio",
+      label: "clip.mp3",
+      mimeType: "audio/mpeg",
+    });
+  });
+
+  it("preserves voice-note metadata on local audio attachments", async () => {
+    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
+    const audioPath = path.join(tmpDir, "clip.mp3");
+    fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
+
+    const blocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
+      [{ mediaUrl: audioPath, trustedLocalMedia: true, audioAsVoice: true }],
+      { localRoots: [tmpDir] },
    );
+
+    expect(blocks).toHaveLength(1);
+    expect(blocks[0]).toMatchObject({
+      type: "attachment",
+      attachment: {
+        isVoiceNote: true,
+      },
+    });
  });

  it("suppresses reasoning payload audio", async () => {
@@ -113,7 +132,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
    );

    expect(blocks).toHaveLength(1);
-    expect((blocks[0] as { type?: string }).type).toBe("audio");
+    expect((blocks[0] as { type?: string }).type).toBe("attachment");
  });

  it("drops tool-result file:// URLs with remote hosts before touching the filesystem", async () => {
@@ -171,7 +190,7 @@ describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
    ]);

    expect(blocks).toHaveLength(1);
-    expect((blocks[0] as { type?: string }).type).toBe("audio");
+    expect((blocks[0] as { type?: string }).type).toBe("attachment");
  });

  it("skips local audio when the opened file stat is over the cap", async () => {
--- a/src/gateway/server-methods/chat-webchat-media.ts
+++ b/src/gateway/server-methods/chat-webchat-media.ts
@@ -9,7 +9,7 @@ import { normalizeLowercaseStringOrEmpty } from "../../shared/string-coerce.js";
 import { sanitizeReplyDirectiveId } from "../../utils/directive-tags.js";
 import { isSuppressedControlReplyText } from "../control-reply-text.js";

-/** Cap embedded audio size to avoid multi‑MB payloads on the chat WebSocket. */
+/** Maximum size, in bytes, of a local audio file exposed through assistant media. */
 const MAX_WEBCHAT_AUDIO_BYTES = 15 * 1024 * 1024;
 const MAX_WEBCHAT_IMAGE_DATA_URL_CHARS = 2_000_000;
 const MAX_WEBCHAT_IMAGE_DATA_BYTES = 1_500_000;
@@ -103,18 +103,16 @@ async function readLocalAudioContentBlockForEmbedding(
    if (opened.stat.size > MAX_WEBCHAT_AUDIO_BYTES) {
      return null;
    }
-    const buf = await opened.handle.readFile();
-    if (buf.length > MAX_WEBCHAT_AUDIO_BYTES) {
-      return null;
-    }
    return {
      path: opened.realPath,
      block: {
-        type: "audio",
-        source: {
-          type: "base64",
-          media_type: mimeTypeForPath(opened.realPath),
-          data: buf.toString("base64"),
+        type: "attachment",
+        attachment: {
+          url: opened.realPath,
+          kind: "audio",
+          label: path.basename(opened.realPath),
+          mimeType: mimeTypeForPath(opened.realPath),
+          ...(payload.audioAsVoice === true ? { isVoiceNote: true } : {}),
        },
      },
    };
--- a/src/gateway/server-methods/chat.directive-tags.test.ts
+++ b/src/gateway/server-methods/chat.directive-tags.test.ts
@@ -756,7 +756,7 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
    });

    await waitForAssertion(() => {
-      const assistantUpdate = findAssistantUpdateWithBlock((block) => block.type === "audio");
+      const assistantUpdate = findAssistantUpdateWithBlock((block) => block.type === "attachment");
      const message = assistantUpdate?.message as Record<string, any> | undefined;
      const content = Array.isArray(message?.content)
        ? (message.content as Array<Record<string, any>>)
@@ -764,9 +764,15 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
      expect(message?.role).toBe("assistant");
      expect(message?.idempotencyKey).toBe("idem-agent-audio:assistant-media");
      expect(content[0]).toEqual({ type: "text", text: "Audio reply" });
-      expect(content[1]?.type).toBe("audio");
-      expect(content[1]?.source?.type).toBe("base64");
-      expect(content[1]?.source?.media_type).toBe("audio/mpeg");
+      expect(content[1]).toEqual({
+        type: "attachment",
+        attachment: {
+          url: fs.realpathSync(audioPath),
+          kind: "audio",
+          label: "reply.mp3",
+          mimeType: "audio/mpeg",
+        },
+      });
    });
  });

@@ -820,9 +826,16 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
    expect(message?.role).toBe("assistant");
    expect(message?.idempotencyKey).toBe("idem-agent-tts:assistant-media");
    expect(content[0]).toEqual({ type: "text", text: "Audio reply" });
-    expect(content[1]?.type).toBe("audio");
-    expect(content[1]?.source?.type).toBe("base64");
-    expect(content[1]?.source?.media_type).toBe("audio/mpeg");
+    expect(content[1]).toEqual({
+      type: "attachment",
+      attachment: {
+        url: fs.realpathSync(audioPath),
+        kind: "audio",
+        label: "tts.mp3",
+        mimeType: "audio/mpeg",
+        isVoiceNote: true,
+      },
+    });
    expect(JSON.stringify(assistantUpdates[0]?.message)).not.toContain(
      "This text is already in the model transcript.",
    );
@@ -957,9 +970,16 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
    const content = getMessageContent(payload);
    expect(getMessage(payload)?.role).toBe("assistant");
    expect(content[0]).toEqual({ type: "text", text: "Command result with TTS." });
-    expect(content[1]?.type).toBe("audio");
-    expect(content[1]?.source?.type).toBe("base64");
-    expect(content[1]?.source?.media_type).toBe("audio/mpeg");
+    expect(content[1]).toEqual({
+      type: "attachment",
+      attachment: {
+        url: fs.realpathSync(audioPath),
+        kind: "audio",
+        label: "tts.mp3",
+        mimeType: "audio/mpeg",
+        isVoiceNote: true,
+      },
+    });
    const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
      (update) =>
        typeof update.message === "object" &&
--- a/ui/src/ui/chat/message-normalizer.test.ts
+++ b/ui/src/ui/chat/message-normalizer.test.ts
@@ -89,6 +89,83 @@ describe("message-normalizer", () => {
      });
    });

+    it("normalizes structured base64 audio content blocks as renderable attachments", () => {
+      const result = normalizeMessage({
+        role: "assistant",
+        content: [
+          {
+            type: "audio",
+            label: "tts.mp3",
+            source: {
+              type: "base64",
+              media_type: "audio/mpeg",
+              data: "//uQAA==",
+            },
+          },
+        ],
+      });
+
+      expect(result.content).toEqual([
+        {
+          type: "attachment",
+          attachment: {
+            url: "data:audio/mpeg;base64,//uQAA==",
+            kind: "audio",
+            label: "tts.mp3",
+            mimeType: "audio/mpeg",
+          },
+        },
+      ]);
+    });
+
+    it("normalizes structured URL audio content blocks as renderable attachments", () => {
+      const result = normalizeMessage({
+        role: "assistant",
+        content: [
+          {
+            type: "audio",
+            label: "clip.mp3",
+            source: {
+              type: "url",
+              media_type: "audio/mpeg",
+              url: "/tmp/openclaw/clip.mp3",
+            },
+          },
+        ],
+      });
+
+      expect(result.content).toEqual([
+        {
+          type: "attachment",
+          attachment: {
+            url: "/tmp/openclaw/clip.mp3",
+            kind: "audio",
+            label: "clip.mp3",
+            mimeType: "audio/mpeg",
+          },
+        },
+      ]);
+    });
+
+    it("does not normalize non-assistant structured audio blocks as attachments", () => {
+      const result = normalizeMessage({
+        role: "user",
+        content: [
+          {
+            type: "audio",
+            label: "upload.mp3",
+            source: {
+              type: "base64",
+              media_type: "audio/mpeg",
+              data: "//uQAA==",
+            },
+          },
+        ],
+      });
+
+      expect(result.content).toEqual([]);
+    });
+
    it("does not reinterpret directive-like user text blocks inside array content", () => {
      const result = normalizeMessage({
        role: "user",
--- a/ui/src/ui/chat/message-normalizer.ts
+++ b/ui/src/ui/chat/message-normalizer.ts
@@ -145,6 +145,58 @@ function inferAttachmentKind(url: string): {
  return { kind, mimeType, label };
 }

+function coerceAudioContentBlock( // Converts a structured `audio` content block into an audio attachment item.
+  item: Record<string, unknown>, // raw content block taken from a message's `content` array
+): Extract<MessageContentItem, { type: "attachment" }> | null { // null means "not a convertible audio block"
+  if (item.type !== "audio") { // only blocks tagged "audio" are handled here
+    return null;
+  }
+  const source = item.source;
+  if (!source || typeof source !== "object" || Array.isArray(source)) { // `source` must be a non-array object
+    return null;
+  }
+  const sourceRecord = source as Record<string, unknown>;
+  const mediaType =
+    typeof sourceRecord.media_type === "string" &&
+    sourceRecord.media_type.trim().toLowerCase().startsWith("audio/") // prefix check is case-insensitive
+      ? sourceRecord.media_type.trim() // declared type is kept trimmed, with its original casing
+      : "audio/mpeg"; // fallback when media_type is missing, not a string, or not an audio/* type
+  if (sourceRecord.type === "base64" && typeof sourceRecord.data === "string") { // inline payload
+    const data = sourceRecord.data.trim();
+    if (!data) { // empty or whitespace-only payload cannot be rendered
+      return null;
+    }
+    const url = data.startsWith("data:") ? data : `data:${mediaType};base64,${data}`; // a value that is already a data: URL is used as-is
+    return {
+      type: "attachment",
+      attachment: {
+        url,
+        kind: "audio",
+        label: typeof item.label === "string" && item.label.trim() ? item.label.trim() : "Audio", // "Audio" when label is missing or blank
+        mimeType: mediaType,
+        ...(item.isVoiceNote === true ? { isVoiceNote: true } : {}), // flag is copied only when strictly `true`
+      },
+    };
+  }
+  if (sourceRecord.type === "url" && typeof sourceRecord.url === "string") { // source referenced by URL
+    const url = sourceRecord.url.trim(); // trimmed but otherwise passed through unchanged
+    if (!url) {
+      return null;
+    }
+    return {
+      type: "attachment",
+      attachment: {
+        url,
+        kind: "audio",
+        label: typeof item.label === "string" && item.label.trim() ? item.label.trim() : "Audio", // same label fallback as the base64 branch
+        mimeType: mediaType,
+        ...(item.isVoiceNote === true ? { isVoiceNote: true } : {}),
+      },
+    };
+  }
+  return null; // any other source type, or a data/url field that is not a string, is unsupported
+}
+
 function mergeAdjacentTextItems(items: MessageContentItem[]): MessageContentItem[] {
  const merged: MessageContentItem[] = [];
  for (const item of items) {
@@ -292,6 +344,14 @@ export function normalizeMessage(message: unknown): NormalizedMessage {
    }
  } else if (Array.isArray(m.content)) {
    content = m.content.flatMap((item: Record<string, unknown>) => {
+      if (isAssistantMessage) {
+        const audioAttachment = coerceAudioContentBlock(item);
+        if (audioAttachment) {
+          return [audioAttachment];
+        }
+      } else if (item.type === "audio") {
+        return [];
+      }
      if (
        item.type === "attachment" &&
        item.attachment &&