fix(gateway): show /tts audio in Control UI webchat (#61598) (thanks @neeravmakwana)

This commit is contained in:
Neerav Makwana
2026-04-06 08:19:38 -04:00
committed by GitHub
parent 02c092e558
commit 9aaa000da0
8 changed files with 353 additions and 13 deletions

View File

@@ -65,6 +65,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Control UI/chat: show `/tts` and other local audio-only slash replies in webchat by embedding local audio in the assistant message and rendering `<audio>` controls instead of dropping empty-text finals. Fixes #61564. (#61598) Thanks @neeravmakwana.
- Security: preserve restrictive plugin-only tool allowlists, require owner access for `/allowlist add` and `/allowlist remove`, fail closed when `before_tool_call` hooks crash, block browser SSRF redirect bypasses earlier, and keep non-interactive auth-choice inference scoped to bundled and already-trusted plugins. (#58476, #59836, #59822, #58771, #59120) Thanks @eleqtrizit and @pgondhi987.
- Providers/OpenAI: make GPT-5 and Codex runs act sooner with lower-verbosity defaults, visible progress during tool work, and a one-shot retry when a turn only narrates the plan instead of taking action.
- Providers/OpenAI and reply delivery: preserve native `reasoning.effort: "none"` and strict schemas where supported, add GPT-5.4 assistant `phase` metadata across replay and the Gateway `/v1/responses` layer, and keep commentary buffered until `final_answer` so web chat, session previews, embedded replies, and Telegram partials stop leaking planning text. Fixes #59150, #59643, #61282.

View File

@@ -43,7 +43,7 @@ describe("normalizeDiffViewerPayloadLanguages", () => {
overflow: "wrap",
unsafeCSS: "",
},
langs: ["not-a-real-language"],
langs: ["not-a-real-language" as never],
fileDiff: {
name: "foo.txt",
lang: "not-a-real-language" as never,
@@ -75,7 +75,7 @@ describe("normalizeDiffViewerPayloadLanguages", () => {
overflow: "scroll",
unsafeCSS: "",
},
langs: ["typescript", "not-a-real-language"],
langs: ["typescript", "not-a-real-language" as never],
oldFile: {
name: "before.unknown",
contents: "before",
@@ -116,7 +116,7 @@ describe("normalizeDiffViewerPayloadLanguages", () => {
overflow: "wrap",
unsafeCSS: "",
},
langs: [" "],
langs: [" " as never],
oldFile: {
name: "before.unknown",
contents: "before",

View File

@@ -16,16 +16,41 @@ export type GatewayInjectedTranscriptAppendResult = {
error?: string;
};
/**
 * Resolve the assistant `content` blocks for a gateway-injected transcript message.
 *
 * - With no explicit `content`, falls back to a single text block built from
 *   `message`, prefixed with `[label]\n\n` when a label is set.
 * - With explicit `content` and no label, the blocks pass through untouched.
 * - With explicit `content` and a label, the label is folded into the leading
 *   text block when there is one, otherwise emitted as its own text block.
 */
function resolveInjectedAssistantContent(params: {
  message: string;
  label?: string;
  content?: Array<Record<string, unknown>>;
}): Array<Record<string, unknown>> {
  const prefix = params.label ? `[${params.label}]\n\n` : "";
  const blocks = params.content ?? [];
  if (blocks.length === 0) {
    // Plain-text fallback: a single (optionally label-prefixed) text block.
    return [{ type: "text", text: `${prefix}${params.message}` }];
  }
  if (!prefix) {
    return blocks;
  }
  const [head, ...rest] = blocks;
  if (head && typeof head === "object" && head.type === "text" && typeof head.text === "string") {
    // Fold the label into the existing leading text block.
    return [{ ...head, text: `${prefix}${head.text}` }, ...rest];
  }
  // No leading text block: emit the label alone (trimmed — no trailing blank lines).
  return [{ type: "text", text: prefix.trim() }, ...blocks];
}
export function appendInjectedAssistantMessageToTranscript(params: {
transcriptPath: string;
message: string;
label?: string;
/** When set, used as the assistant `content` array (e.g. text + embedded audio blocks). */
content?: Array<Record<string, unknown>>;
idempotencyKey?: string;
abortMeta?: GatewayInjectedAbortMeta;
now?: number;
}): GatewayInjectedTranscriptAppendResult {
const now = params.now ?? Date.now();
const labelPrefix = params.label ? `[${params.label}]\n\n` : "";
const usage = {
input: 0,
output: 0,
@@ -40,9 +65,18 @@ export function appendInjectedAssistantMessageToTranscript(params: {
total: 0,
},
};
const resolvedContent = resolveInjectedAssistantContent({
message: params.message,
label: params.label,
content: params.content,
});
const messageBody: AppendMessageArg & Record<string, unknown> = {
role: "assistant",
content: [{ type: "text", text: `${labelPrefix}${params.message}` }],
// Gateway-injected assistant messages can include non-model content blocks (e.g. embedded TTS audio).
content: resolvedContent as unknown as Extract<
AppendMessageArg,
{ role: "assistant" }
>["content"],
timestamp: now,
// Pi stopReason is a strict enum; this is not model output, but we still store it as a
// normal assistant message so it participates in the session parentId chain.

View File

@@ -0,0 +1,102 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { pathToFileURL } from "node:url";
import { afterEach, describe, expect, it, vi } from "vitest";
import { buildWebchatAudioContentBlocksFromReplyPayloads } from "./chat-webchat-media.js";
// Tests for buildWebchatAudioContentBlocksFromReplyPayloads: local audio files
// referenced by reply payloads are embedded as base64 `audio` content blocks,
// while remote URLs, non-audio files, duplicates, and oversized files are skipped.
describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
  // Per-test scratch directory holding fixture media files.
  let tmpDir: string | undefined;

  afterEach(() => {
    // Remove the scratch directory so tests stay independent.
    if (tmpDir && fs.existsSync(tmpDir)) {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }
    tmpDir = undefined;
  });

  it("embeds a local audio file as a base64 gateway chat block", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "clip.mp3");
    // Arbitrary bytes; only the `.mp3` extension matters for the audio check.
    fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: audioPath }]);
    expect(blocks).toHaveLength(1);
    const block = blocks[0] as {
      type?: string;
      source?: { type?: string; media_type?: string; data?: string };
    };
    expect(block.type).toBe("audio");
    expect(block.source?.type).toBe("base64");
    expect(block.source?.media_type).toBe("audio/mpeg");
    // `data` must be raw base64, not a data: URL.
    expect(block.source?.data?.includes("data:")).toBe(false);
    // Round-trip: decoding the base64 yields the original file bytes.
    expect(Buffer.from(block.source?.data ?? "", "base64")).toEqual(
      Buffer.from([0xff, 0xfb, 0x90, 0x00]),
    );
  });

  it("skips remote URLs", () => {
    // Only local files are embedded; http(s) media is not fetched.
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([
      { mediaUrl: "https://example.com/a.mp3" },
    ]);
    expect(blocks).toHaveLength(0);
  });

  it("skips non-audio local files", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const imagePath = path.join(tmpDir, "clip.png");
    fs.writeFileSync(imagePath, Buffer.from([0x89, 0x50, 0x4e, 0x47]));
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: imagePath }]);
    expect(blocks).toHaveLength(0);
  });

  it("dedupes repeated paths", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "clip.mp3");
    fs.writeFileSync(audioPath, Buffer.from([0x00]));
    // The same file referenced twice must only be embedded once.
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([
      { mediaUrl: audioPath },
      { mediaUrl: audioPath },
    ]);
    expect(blocks).toHaveLength(1);
  });

  it("embeds file:// URLs pointing at a local file", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "clip.mp3");
    fs.writeFileSync(audioPath, Buffer.from([0x01]));
    // file: URLs should resolve to the same local path and embed normally.
    const fileUrl = pathToFileURL(audioPath).href;
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: fileUrl }]);
    expect(blocks).toHaveLength(1);
    expect((blocks[0] as { type?: string }).type).toBe("audio");
  });

  it("does not read file contents when stat reports size over the cap", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "huge.mp3");
    fs.writeFileSync(audioPath, Buffer.from([0x02]));
    // Fake a 16 MiB stat result for this path so the size cap rejects it
    // before any read; every other path goes through the real statSync.
    const origStat = fs.statSync.bind(fs);
    const statSpy = vi.spyOn(fs, "statSync").mockImplementation((p: fs.PathLike) => {
      if (String(p) === audioPath) {
        return { isFile: () => true, size: 16 * 1024 * 1024 } as fs.Stats;
      }
      return origStat(p);
    });
    const readSpy = vi.spyOn(fs, "readFileSync");
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: audioPath }]);
    expect(blocks).toHaveLength(0);
    // The oversized file must be rejected from stat alone — no read attempted.
    expect(readSpy).not.toHaveBeenCalled();
    statSpy.mockRestore();
    readSpy.mockRestore();
  });
});

View File

@@ -0,0 +1,121 @@
import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { ReplyPayload } from "../../auto-reply/types.js";
import { isAudioFileName } from "../../media/mime.js";
import { resolveSendableOutboundReplyParts } from "../../plugin-sdk/reply-payload.js";
/** Cap embedded audio size to avoid multi-MB payloads on the chat WebSocket. */
const MAX_WEBCHAT_AUDIO_BYTES = 15 * 1024 * 1024;
// File extension → MIME type for embedded audio data. Unknown extensions fall
// back to `audio/mpeg` (see mimeTypeForPath).
const MIME_BY_EXT: Record<string, string> = {
  ".aac": "audio/aac",
  ".m4a": "audio/mp4",
  ".mp3": "audio/mpeg",
  ".oga": "audio/ogg",
  ".ogg": "audio/ogg",
  ".opus": "audio/opus",
  ".wav": "audio/wav",
  ".webm": "audio/webm",
};
/** Map `mediaUrl` strings to an absolute filesystem path for local embedding (plain paths or `file:` URLs). */
function resolveLocalMediaPathForEmbedding(raw: string): string | null {
  const value = raw.trim();
  if (value.length === 0) {
    return null;
  }
  // Inline payloads and remote URLs are never embedded from disk.
  const lower = value.toLowerCase();
  if (lower.startsWith("data:") || lower.startsWith("http:") || lower.startsWith("https:")) {
    return null;
  }
  // `file:` URLs (lowercase-scheme only, matching the URL parser's input) become plain paths.
  if (value.startsWith("file:")) {
    let localPath: string;
    try {
      localPath = fileURLToPath(value);
    } catch {
      // Malformed or non-absolute file URL.
      return null;
    }
    return path.isAbsolute(localPath) ? localPath : null;
  }
  // Bare strings must already be absolute paths; relative paths are rejected.
  return path.isAbsolute(value) ? value : null;
}
/** Returns a readable local file path when it is a regular file and within the size cap (single stat before read). */
function resolveLocalAudioFileForEmbedding(raw: string): string | null {
  const localPath = resolveLocalMediaPathForEmbedding(raw);
  if (localPath === null || !isAudioFileName(localPath)) {
    return null;
  }
  try {
    const stats = fs.statSync(localPath);
    // Reject non-regular files and anything over the embed cap before any read happens.
    const embeddable = stats.isFile() && stats.size <= MAX_WEBCHAT_AUDIO_BYTES;
    return embeddable ? localPath : null;
  } catch {
    // stat failed (missing file, permissions, …) — treat as not embeddable.
    return null;
  }
}
/** Look up the MIME type for an audio file by extension; defaults to `audio/mpeg` when unknown. */
function mimeTypeForPath(filePath: string): string {
  return MIME_BY_EXT[path.extname(filePath).toLowerCase()] ?? "audio/mpeg";
}
/**
 * Build Control UI / transcript `content` blocks for local TTS (or other) audio files
 * referenced by slash-command / agent replies when the webchat path only had text aggregation.
 *
 * Each embeddable local audio path is read once (duplicates across payloads are
 * deduped); unreadable or non-embeddable media is silently skipped.
 */
export function buildWebchatAudioContentBlocksFromReplyPayloads(
  payloads: ReplyPayload[],
): Array<Record<string, unknown>> {
  const embedded = new Set<string>();
  const result: Array<Record<string, unknown>> = [];
  for (const payload of payloads) {
    const parts = resolveSendableOutboundReplyParts(payload);
    for (const rawUrl of parts.mediaUrls) {
      const candidate = rawUrl.trim();
      if (!candidate) {
        continue;
      }
      const localPath = resolveLocalAudioFileForEmbedding(candidate);
      // Skip media that cannot be embedded and paths already embedded once.
      if (!localPath || embedded.has(localPath)) {
        continue;
      }
      embedded.add(localPath);
      const block = tryReadLocalAudioContentBlock(localPath);
      if (block) {
        result.push(block);
      }
    }
  }
  return result;
}
/**
 * Read a local audio file and wrap it as a base64 `audio` content block.
 * Returns null when the read fails or the file exceeds the embed cap
 * (re-checked here in case the file changed after the earlier stat).
 */
function tryReadLocalAudioContentBlock(filePath: string): Record<string, unknown> | null {
  try {
    const bytes = fs.readFileSync(filePath);
    if (bytes.length > MAX_WEBCHAT_AUDIO_BYTES) {
      return null;
    }
    return {
      type: "audio",
      source: {
        type: "base64",
        media_type: mimeTypeForPath(filePath),
        data: bytes.toString("base64"),
      },
    };
  } catch {
    // Unreadable file — caller treats this as "nothing to embed".
    return null;
  }
}

View File

@@ -74,6 +74,7 @@ import { injectTimestamp, timestampOptsFromConfig } from "./agent-timestamp.js";
import { setGatewayDedupeEntry } from "./agent-wait-dedupe.js";
import { normalizeRpcAttachmentsToChatAttachments } from "./attachment-normalize.js";
import { appendInjectedAssistantMessageToTranscript } from "./chat-transcript-inject.js";
import { buildWebchatAudioContentBlocksFromReplyPayloads } from "./chat-webchat-media.js";
import type {
GatewayRequestContext,
GatewayRequestHandlerOptions,
@@ -856,6 +857,8 @@ function transcriptHasIdempotencyKey(transcriptPath: string, idempotencyKey: str
function appendAssistantTranscriptMessage(params: {
message: string;
/** Rich Pi message blocks (text, embedded audio, etc.). Overrides plain `message` when set. */
content?: Array<Record<string, unknown>>;
label?: string;
sessionId: string;
storePath: string | undefined;
@@ -900,6 +903,7 @@ function appendAssistantTranscriptMessage(params: {
transcriptPath,
message: params.message,
label: params.label,
content: params.content,
idempotencyKey: params.idempotencyKey,
abortMeta: params.abortMeta,
});
@@ -1788,20 +1792,33 @@ export const chatHandlers: GatewayRequestHandlers = {
sessionKey,
});
} else {
const combinedReply = deliveredReplies
const finalPayloads = deliveredReplies
.filter((entry) => entry.kind === "final")
.map((entry) => entry.payload)
.map((entry) => entry.payload);
const combinedReply = finalPayloads
.map((part) => part.text?.trim() ?? "")
.filter(Boolean)
.join("\n\n")
.trim();
let message: Record<string, unknown> | undefined;
const audioBlocks = buildWebchatAudioContentBlocksFromReplyPayloads(finalPayloads);
const assistantContent: Array<Record<string, unknown>> = [];
if (combinedReply) {
assistantContent.push({ type: "text", text: combinedReply });
} else if (audioBlocks.length > 0) {
assistantContent.push({ type: "text", text: "Audio reply" });
}
assistantContent.push(...audioBlocks);
let message: Record<string, unknown> | undefined;
if (assistantContent.length > 0) {
const { storePath: latestStorePath, entry: latestEntry } =
loadSessionEntry(sessionKey);
const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
const transcriptFallbackText =
combinedReply || (audioBlocks.length > 0 ? "Audio reply" : "");
const appended = appendAssistantTranscriptMessage({
message: combinedReply,
message: transcriptFallbackText,
content: assistantContent,
sessionId,
storePath: latestStorePath,
sessionFile: latestEntry?.sessionFile,
@@ -1817,7 +1834,7 @@ export const chatHandlers: GatewayRequestHandlers = {
const now = Date.now();
message = {
role: "assistant",
content: [{ type: "text", text: combinedReply }],
content: assistantContent,
timestamp: now,
// Keep this compatible with Pi stopReason enums even though this message isn't
// persisted to the transcript due to the append failure.

View File

@@ -305,6 +305,20 @@
justify-content: flex-end;
}
/* Embedded audio (e.g. gateway-injected TTS from slash commands) */
.chat-message-audio {
  display: flex;
  flex-direction: column;
  gap: 8px;
  margin-bottom: 8px;
  /* Keep players compact on wide viewports; shrink with the bubble on narrow ones */
  max-width: min(420px, 100%);
}

/* The <audio controls> element itself */
.chat-message-audio-el {
  width: 100%;
  min-height: 36px;
}
/* Compose input row - horizontal layout */
.chat-compose__row {
display: flex;

View File

@@ -23,6 +23,10 @@ type ImageBlock = {
alt?: string;
};
/** One playable audio attachment extracted from a message's content blocks (a `data:` URL). */
type AudioClip = {
  url: string;
};
function extractImages(message: unknown): ImageBlock[] {
const m = message as Record<string, unknown>;
const content = m.content;
@@ -60,6 +64,32 @@ function extractImages(message: unknown): ImageBlock[] {
return images;
}
/**
 * Pull embedded audio out of a chat message's `content` array.
 *
 * Blocks shaped like `{ type: "audio", source: { type: "base64", media_type, data } }`
 * are converted into playable `data:` URLs; everything else is ignored. When
 * `data` is already a `data:` URL it is used verbatim; otherwise one is
 * synthesized from `media_type`, defaulting to `audio/mpeg` when the media
 * type is missing, empty, or not a string.
 */
function extractAudioClips(message: unknown): AudioClip[] {
  const m = message as Record<string, unknown>;
  const content = m.content;
  const clips: AudioClip[] = [];
  if (!Array.isArray(content)) {
    return clips;
  }
  for (const block of content) {
    if (typeof block !== "object" || block === null) {
      continue;
    }
    const b = block as Record<string, unknown>;
    if (b.type !== "audio") {
      continue;
    }
    const source = b.source as Record<string, unknown> | undefined;
    if (source?.type === "base64" && typeof source.data === "string") {
      const data = source.data;
      // Guard the media type instead of blindly casting: a truthy non-string
      // value would otherwise stringify into a bogus data: URL prefix.
      const mediaType =
        typeof source.media_type === "string" && source.media_type
          ? source.media_type
          : "audio/mpeg";
      const url = data.startsWith("data:") ? data : `data:${mediaType};base64,${data}`;
      clips.push({ url });
    }
  }
  return clips;
}
export function renderReadingIndicatorGroup(assistant?: AssistantIdentity, basePath?: string) {
return html`
<div class="chat-group assistant">
@@ -580,6 +610,25 @@ function renderMessageImages(images: ImageBlock[]) {
`;
}
/** Render embedded audio clips as a stacked column of native `<audio>` players. */
function renderMessageAudio(clips: AudioClip[]) {
  if (!clips.length) {
    return nothing;
  }
  const players = clips.map(
    (clip) =>
      html`<audio
        class="chat-message-audio-el"
        controls
        preload="metadata"
        src=${clip.url}
      ></audio>`,
  );
  return html`<div class="chat-message-audio">${players}</div>`;
}
/** Render tool cards inside a collapsed `<details>` element. */
function renderCollapsedToolCards(
toolCards: ToolCard[],
@@ -688,6 +737,8 @@ function renderGroupedMessage(
const hasToolCards = toolCards.length > 0;
const images = extractImages(message);
const hasImages = images.length > 0;
const audioClips = extractAudioClips(message);
const hasAudio = audioClips.length > 0;
const extractedText = extractTextCached(message);
const extractedThinking =
@@ -711,7 +762,7 @@ function renderGroupedMessage(
// Suppress empty bubbles when tool cards are the only content and toggle is off
const visibleToolCards = hasToolCards && (opts.showToolCalls ?? true);
if (!markdown && !visibleToolCards && !hasImages) {
if (!markdown && !visibleToolCards && !hasImages && !hasAudio) {
return nothing;
}
@@ -747,7 +798,7 @@ function renderGroupedMessage(
: nothing}
</summary>
<div class="chat-tool-msg-body">
${renderMessageImages(images)}
${renderMessageImages(images)} ${renderMessageAudio(audioClips)}
${reasoningMarkdown
? html`<div class="chat-thinking">
${unsafeHTML(toSanitizedMarkdownHtml(reasoningMarkdown))}
@@ -771,7 +822,7 @@ function renderGroupedMessage(
</details>
`
: html`
${renderMessageImages(images)}
${renderMessageImages(images)} ${renderMessageAudio(audioClips)}
${reasoningMarkdown
? html`<div class="chat-thinking">
${unsafeHTML(toSanitizedMarkdownHtml(reasoningMarkdown))}