fix(gateway): show /tts audio in Control UI webchat (#61598) (thanks @neeravmakwana)

This commit is contained in:
Neerav Makwana
2026-04-06 08:19:38 -04:00
committed by GitHub
parent 02c092e558
commit 9aaa000da0
8 changed files with 353 additions and 13 deletions

View File

@@ -65,6 +65,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Control UI/chat: show `/tts` and other local audio-only slash replies in webchat by embedding local audio in the assistant message and rendering `<audio>` controls instead of dropping empty-text finals. Fixes #61564. (#61598) Thanks @neeravmakwana.
- Security: preserve restrictive plugin-only tool allowlists, require owner access for `/allowlist add` and `/allowlist remove`, fail closed when `before_tool_call` hooks crash, block browser SSRF redirect bypasses earlier, and keep non-interactive auth-choice inference scoped to bundled and already-trusted plugins. (#58476, #59836, #59822, #58771, #59120) Thanks @eleqtrizit and @pgondhi987.
- Providers/OpenAI: make GPT-5 and Codex runs act sooner with lower-verbosity defaults, visible progress during tool work, and a one-shot retry when a turn only narrates the plan instead of taking action.
- Providers/OpenAI and reply delivery: preserve native `reasoning.effort: "none"` and strict schemas where supported, add GPT-5.4 assistant `phase` metadata across replay and the Gateway `/v1/responses` layer, and keep commentary buffered until `final_answer` so web chat, session previews, embedded replies, and Telegram partials stop leaking planning text. Fixes #59150, #59643, #61282.

View File

@@ -43,7 +43,7 @@ describe("normalizeDiffViewerPayloadLanguages", () => {
overflow: "wrap",
unsafeCSS: "",
},
langs: ["not-a-real-language"],
langs: ["not-a-real-language" as never],
fileDiff: {
name: "foo.txt",
lang: "not-a-real-language" as never,
@@ -75,7 +75,7 @@ describe("normalizeDiffViewerPayloadLanguages", () => {
overflow: "scroll",
unsafeCSS: "",
},
langs: ["typescript", "not-a-real-language"],
langs: ["typescript", "not-a-real-language" as never],
oldFile: {
name: "before.unknown",
contents: "before",
@@ -116,7 +116,7 @@ describe("normalizeDiffViewerPayloadLanguages", () => {
overflow: "wrap",
unsafeCSS: "",
},
langs: [" "],
langs: [" " as never],
oldFile: {
name: "before.unknown",
contents: "before",

View File

@@ -16,16 +16,41 @@ export type GatewayInjectedTranscriptAppendResult = {
error?: string;
};
/**
 * Resolve the assistant `content` blocks for a gateway-injected transcript message.
 *
 * - With no explicit `content`, falls back to a single text block built from
 *   `message`, prefixed with `[label]\n\n` when a label is set.
 * - With explicit `content` and no label, the blocks pass through untouched.
 * - With explicit `content` and a label, the label is folded into the leading
 *   text block when there is one, otherwise emitted as its own text block.
 */
function resolveInjectedAssistantContent(params: {
  message: string;
  label?: string;
  content?: Array<Record<string, unknown>>;
}): Array<Record<string, unknown>> {
  const prefix = params.label ? `[${params.label}]\n\n` : "";
  const blocks = params.content ?? [];
  if (blocks.length === 0) {
    // Plain-text fallback: a single (optionally label-prefixed) text block.
    return [{ type: "text", text: `${prefix}${params.message}` }];
  }
  if (!prefix) {
    return blocks;
  }
  const [head, ...rest] = blocks;
  if (head && typeof head === "object" && head.type === "text" && typeof head.text === "string") {
    // Fold the label into the existing leading text block.
    return [{ ...head, text: `${prefix}${head.text}` }, ...rest];
  }
  // No leading text block: emit the label alone (trimmed — no trailing blank lines).
  return [{ type: "text", text: prefix.trim() }, ...blocks];
}
export function appendInjectedAssistantMessageToTranscript(params: {
transcriptPath: string;
message: string;
label?: string;
/** When set, used as the assistant `content` array (e.g. text + embedded audio blocks). */
content?: Array<Record<string, unknown>>;
idempotencyKey?: string;
abortMeta?: GatewayInjectedAbortMeta;
now?: number;
}): GatewayInjectedTranscriptAppendResult {
const now = params.now ?? Date.now();
const labelPrefix = params.label ? `[${params.label}]\n\n` : "";
const usage = {
input: 0,
output: 0,
@@ -40,9 +65,18 @@ export function appendInjectedAssistantMessageToTranscript(params: {
total: 0,
},
};
const resolvedContent = resolveInjectedAssistantContent({
message: params.message,
label: params.label,
content: params.content,
});
const messageBody: AppendMessageArg & Record<string, unknown> = {
role: "assistant",
content: [{ type: "text", text: `${labelPrefix}${params.message}` }],
// Gateway-injected assistant messages can include non-model content blocks (e.g. embedded TTS audio).
content: resolvedContent as unknown as Extract<
AppendMessageArg,
{ role: "assistant" }
>["content"],
timestamp: now,
// Pi stopReason is a strict enum; this is not model output, but we still store it as a
// normal assistant message so it participates in the session parentId chain.

View File

@@ -0,0 +1,102 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { pathToFileURL } from "node:url";
import { afterEach, describe, expect, it, vi } from "vitest";
import { buildWebchatAudioContentBlocksFromReplyPayloads } from "./chat-webchat-media.js";
// Tests for buildWebchatAudioContentBlocksFromReplyPayloads: local audio files
// referenced by reply payloads are embedded as base64 `audio` content blocks,
// while remote URLs, non-audio files, duplicates, and oversized files are skipped.
describe("buildWebchatAudioContentBlocksFromReplyPayloads", () => {
  // Per-test scratch directory holding fixture media files.
  let tmpDir: string | undefined;

  afterEach(() => {
    // Remove the scratch directory so tests stay independent.
    if (tmpDir && fs.existsSync(tmpDir)) {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }
    tmpDir = undefined;
  });

  it("embeds a local audio file as a base64 gateway chat block", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "clip.mp3");
    // Arbitrary bytes; only the `.mp3` extension matters for the audio check.
    fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: audioPath }]);
    expect(blocks).toHaveLength(1);
    const block = blocks[0] as {
      type?: string;
      source?: { type?: string; media_type?: string; data?: string };
    };
    expect(block.type).toBe("audio");
    expect(block.source?.type).toBe("base64");
    expect(block.source?.media_type).toBe("audio/mpeg");
    // `data` must be raw base64, not a data: URL.
    expect(block.source?.data?.includes("data:")).toBe(false);
    // Round-trip: decoding the base64 yields the original file bytes.
    expect(Buffer.from(block.source?.data ?? "", "base64")).toEqual(
      Buffer.from([0xff, 0xfb, 0x90, 0x00]),
    );
  });

  it("skips remote URLs", () => {
    // Only local files are embedded; http(s) media is not fetched.
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([
      { mediaUrl: "https://example.com/a.mp3" },
    ]);
    expect(blocks).toHaveLength(0);
  });

  it("skips non-audio local files", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const imagePath = path.join(tmpDir, "clip.png");
    fs.writeFileSync(imagePath, Buffer.from([0x89, 0x50, 0x4e, 0x47]));
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: imagePath }]);
    expect(blocks).toHaveLength(0);
  });

  it("dedupes repeated paths", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "clip.mp3");
    fs.writeFileSync(audioPath, Buffer.from([0x00]));
    // The same file referenced twice must only be embedded once.
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([
      { mediaUrl: audioPath },
      { mediaUrl: audioPath },
    ]);
    expect(blocks).toHaveLength(1);
  });

  it("embeds file:// URLs pointing at a local file", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "clip.mp3");
    fs.writeFileSync(audioPath, Buffer.from([0x01]));
    // file: URLs should resolve to the same local path and embed normally.
    const fileUrl = pathToFileURL(audioPath).href;
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: fileUrl }]);
    expect(blocks).toHaveLength(1);
    expect((blocks[0] as { type?: string }).type).toBe("audio");
  });

  it("does not read file contents when stat reports size over the cap", () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-webchat-audio-"));
    const audioPath = path.join(tmpDir, "huge.mp3");
    fs.writeFileSync(audioPath, Buffer.from([0x02]));
    // Fake a 16 MiB stat result for this path so the size cap rejects it
    // before any read; every other path goes through the real statSync.
    const origStat = fs.statSync.bind(fs);
    const statSpy = vi.spyOn(fs, "statSync").mockImplementation((p: fs.PathLike) => {
      if (String(p) === audioPath) {
        return { isFile: () => true, size: 16 * 1024 * 1024 } as fs.Stats;
      }
      return origStat(p);
    });
    const readSpy = vi.spyOn(fs, "readFileSync");
    const blocks = buildWebchatAudioContentBlocksFromReplyPayloads([{ mediaUrl: audioPath }]);
    expect(blocks).toHaveLength(0);
    // The oversized file must be rejected from stat alone — no read attempted.
    expect(readSpy).not.toHaveBeenCalled();
    statSpy.mockRestore();
    readSpy.mockRestore();
  });
});

View File

@@ -0,0 +1,121 @@
import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { ReplyPayload } from "../../auto-reply/types.js";
import { isAudioFileName } from "../../media/mime.js";
import { resolveSendableOutboundReplyParts } from "../../plugin-sdk/reply-payload.js";
/** Cap embedded audio size to avoid multi-MB payloads on the chat WebSocket. */
const MAX_WEBCHAT_AUDIO_BYTES = 15 * 1024 * 1024;
// File extension → MIME type for embedded audio data. Unknown extensions fall
// back to `audio/mpeg` (see mimeTypeForPath).
const MIME_BY_EXT: Record<string, string> = {
  ".aac": "audio/aac",
  ".m4a": "audio/mp4",
  ".mp3": "audio/mpeg",
  ".oga": "audio/ogg",
  ".ogg": "audio/ogg",
  ".opus": "audio/opus",
  ".wav": "audio/wav",
  ".webm": "audio/webm",
};
/** Map `mediaUrl` strings to an absolute filesystem path for local embedding (plain paths or `file:` URLs). */
function resolveLocalMediaPathForEmbedding(raw: string): string | null {
  const value = raw.trim();
  if (value.length === 0) {
    return null;
  }
  // Inline payloads and remote URLs are never embedded from disk.
  const lower = value.toLowerCase();
  if (lower.startsWith("data:") || lower.startsWith("http:") || lower.startsWith("https:")) {
    return null;
  }
  // `file:` URLs (lowercase-scheme only, matching the URL parser's input) become plain paths.
  if (value.startsWith("file:")) {
    let localPath: string;
    try {
      localPath = fileURLToPath(value);
    } catch {
      // Malformed or non-absolute file URL.
      return null;
    }
    return path.isAbsolute(localPath) ? localPath : null;
  }
  // Bare strings must already be absolute paths; relative paths are rejected.
  return path.isAbsolute(value) ? value : null;
}
/** Returns a readable local file path when it is a regular file and within the size cap (single stat before read). */
function resolveLocalAudioFileForEmbedding(raw: string): string | null {
  const localPath = resolveLocalMediaPathForEmbedding(raw);
  if (localPath === null || !isAudioFileName(localPath)) {
    return null;
  }
  try {
    const stats = fs.statSync(localPath);
    // Reject non-regular files and anything over the embed cap before any read happens.
    const embeddable = stats.isFile() && stats.size <= MAX_WEBCHAT_AUDIO_BYTES;
    return embeddable ? localPath : null;
  } catch {
    // stat failed (missing file, permissions, …) — treat as not embeddable.
    return null;
  }
}
/** Look up the MIME type for an audio file by extension; defaults to `audio/mpeg` when unknown. */
function mimeTypeForPath(filePath: string): string {
  return MIME_BY_EXT[path.extname(filePath).toLowerCase()] ?? "audio/mpeg";
}
/**
 * Build Control UI / transcript `content` blocks for local TTS (or other) audio files
 * referenced by slash-command / agent replies when the webchat path only had text aggregation.
 *
 * Each embeddable local audio path is read once (duplicates across payloads are
 * deduped); unreadable or non-embeddable media is silently skipped.
 */
export function buildWebchatAudioContentBlocksFromReplyPayloads(
  payloads: ReplyPayload[],
): Array<Record<string, unknown>> {
  const embedded = new Set<string>();
  const result: Array<Record<string, unknown>> = [];
  for (const payload of payloads) {
    const parts = resolveSendableOutboundReplyParts(payload);
    for (const rawUrl of parts.mediaUrls) {
      const candidate = rawUrl.trim();
      if (!candidate) {
        continue;
      }
      const localPath = resolveLocalAudioFileForEmbedding(candidate);
      // Skip media that cannot be embedded and paths already embedded once.
      if (!localPath || embedded.has(localPath)) {
        continue;
      }
      embedded.add(localPath);
      const block = tryReadLocalAudioContentBlock(localPath);
      if (block) {
        result.push(block);
      }
    }
  }
  return result;
}
/**
 * Read a local audio file and wrap it as a base64 `audio` content block.
 * Returns null when the read fails or the file exceeds the embed cap
 * (re-checked here in case the file changed after the earlier stat).
 */
function tryReadLocalAudioContentBlock(filePath: string): Record<string, unknown> | null {
  try {
    const bytes = fs.readFileSync(filePath);
    if (bytes.length > MAX_WEBCHAT_AUDIO_BYTES) {
      return null;
    }
    return {
      type: "audio",
      source: {
        type: "base64",
        media_type: mimeTypeForPath(filePath),
        data: bytes.toString("base64"),
      },
    };
  } catch {
    // Unreadable file — caller treats this as "nothing to embed".
    return null;
  }
}

View File

@@ -74,6 +74,7 @@ import { injectTimestamp, timestampOptsFromConfig } from "./agent-timestamp.js";
import { setGatewayDedupeEntry } from "./agent-wait-dedupe.js";
import { normalizeRpcAttachmentsToChatAttachments } from "./attachment-normalize.js";
import { appendInjectedAssistantMessageToTranscript } from "./chat-transcript-inject.js";
import { buildWebchatAudioContentBlocksFromReplyPayloads } from "./chat-webchat-media.js";
import type {
GatewayRequestContext,
GatewayRequestHandlerOptions,
@@ -856,6 +857,8 @@ function transcriptHasIdempotencyKey(transcriptPath: string, idempotencyKey: str
function appendAssistantTranscriptMessage(params: {
message: string;
/** Rich Pi message blocks (text, embedded audio, etc.). Overrides plain `message` when set. */
content?: Array<Record<string, unknown>>;
label?: string;
sessionId: string;
storePath: string | undefined;
@@ -900,6 +903,7 @@ function appendAssistantTranscriptMessage(params: {
transcriptPath,
message: params.message,
label: params.label,
content: params.content,
idempotencyKey: params.idempotencyKey,
abortMeta: params.abortMeta,
});
@@ -1788,20 +1792,33 @@ export const chatHandlers: GatewayRequestHandlers = {
sessionKey,
});
} else {
const combinedReply = deliveredReplies
const finalPayloads = deliveredReplies
.filter((entry) => entry.kind === "final")
.map((entry) => entry.payload)
.map((entry) => entry.payload);
const combinedReply = finalPayloads
.map((part) => part.text?.trim() ?? "")
.filter(Boolean)
.join("\n\n")
.trim();
let message: Record<string, unknown> | undefined;
const audioBlocks = buildWebchatAudioContentBlocksFromReplyPayloads(finalPayloads);
const assistantContent: Array<Record<string, unknown>> = [];
if (combinedReply) {
assistantContent.push({ type: "text", text: combinedReply });
} else if (audioBlocks.length > 0) {
assistantContent.push({ type: "text", text: "Audio reply" });
}
assistantContent.push(...audioBlocks);
let message: Record<string, unknown> | undefined;
if (assistantContent.length > 0) {
const { storePath: latestStorePath, entry: latestEntry } =
loadSessionEntry(sessionKey);
const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
const transcriptFallbackText =
combinedReply || (audioBlocks.length > 0 ? "Audio reply" : "");
const appended = appendAssistantTranscriptMessage({
message: combinedReply,
message: transcriptFallbackText,
content: assistantContent,
sessionId,
storePath: latestStorePath,
sessionFile: latestEntry?.sessionFile,
@@ -1817,7 +1834,7 @@ export const chatHandlers: GatewayRequestHandlers = {
const now = Date.now();
message = {
role: "assistant",
content: [{ type: "text", text: combinedReply }],
content: assistantContent,
timestamp: now,
// Keep this compatible with Pi stopReason enums even though this message isn't
// persisted to the transcript due to the append failure.

View File

@@ -305,6 +305,20 @@
justify-content: flex-end;
}
/* Embedded audio (e.g. gateway-injected TTS from slash commands) */
.chat-message-audio {
  display: flex;
  flex-direction: column;
  gap: 8px;
  margin-bottom: 8px;
  /* Keep players compact on wide viewports; shrink with the bubble on narrow ones */
  max-width: min(420px, 100%);
}

/* The <audio controls> element itself */
.chat-message-audio-el {
  width: 100%;
  min-height: 36px;
}
/* Compose input row - horizontal layout */
.chat-compose__row {
display: flex;

View File

@@ -23,6 +23,10 @@ type ImageBlock = {
alt?: string;
};
/** One playable audio attachment extracted from a message's content blocks (a `data:` URL). */
type AudioClip = {
  url: string;
};
function extractImages(message: unknown): ImageBlock[] {
const m = message as Record<string, unknown>;
const content = m.content;
@@ -60,6 +64,32 @@ function extractImages(message: unknown): ImageBlock[] {
return images;
}
/**
 * Pull embedded audio out of a chat message's `content` array.
 *
 * Blocks shaped like `{ type: "audio", source: { type: "base64", media_type, data } }`
 * are converted into playable `data:` URLs; everything else is ignored. When
 * `data` is already a `data:` URL it is used verbatim; otherwise one is
 * synthesized from `media_type`, defaulting to `audio/mpeg` when the media
 * type is missing, empty, or not a string.
 */
function extractAudioClips(message: unknown): AudioClip[] {
  const m = message as Record<string, unknown>;
  const content = m.content;
  const clips: AudioClip[] = [];
  if (!Array.isArray(content)) {
    return clips;
  }
  for (const block of content) {
    if (typeof block !== "object" || block === null) {
      continue;
    }
    const b = block as Record<string, unknown>;
    if (b.type !== "audio") {
      continue;
    }
    const source = b.source as Record<string, unknown> | undefined;
    if (source?.type === "base64" && typeof source.data === "string") {
      const data = source.data;
      // Guard the media type instead of blindly casting: a truthy non-string
      // value would otherwise stringify into a bogus data: URL prefix.
      const mediaType =
        typeof source.media_type === "string" && source.media_type
          ? source.media_type
          : "audio/mpeg";
      const url = data.startsWith("data:") ? data : `data:${mediaType};base64,${data}`;
      clips.push({ url });
    }
  }
  return clips;
}
export function renderReadingIndicatorGroup(assistant?: AssistantIdentity, basePath?: string) {
return html`
<div class="chat-group assistant">
@@ -580,6 +610,25 @@ function renderMessageImages(images: ImageBlock[]) {
`;
}
/** Render embedded audio clips as a stacked column of native `<audio>` players. */
function renderMessageAudio(clips: AudioClip[]) {
  if (!clips.length) {
    return nothing;
  }
  const players = clips.map(
    (clip) =>
      html`<audio
        class="chat-message-audio-el"
        controls
        preload="metadata"
        src=${clip.url}
      ></audio>`,
  );
  return html`<div class="chat-message-audio">${players}</div>`;
}
/** Render tool cards inside a collapsed `<details>` element. */
function renderCollapsedToolCards(
toolCards: ToolCard[],
@@ -688,6 +737,8 @@ function renderGroupedMessage(
const hasToolCards = toolCards.length > 0;
const images = extractImages(message);
const hasImages = images.length > 0;
const audioClips = extractAudioClips(message);
const hasAudio = audioClips.length > 0;
const extractedText = extractTextCached(message);
const extractedThinking =
@@ -711,7 +762,7 @@ function renderGroupedMessage(
// Suppress empty bubbles when tool cards are the only content and toggle is off
const visibleToolCards = hasToolCards && (opts.showToolCalls ?? true);
if (!markdown && !visibleToolCards && !hasImages) {
if (!markdown && !visibleToolCards && !hasImages && !hasAudio) {
return nothing;
}
@@ -747,7 +798,7 @@ function renderGroupedMessage(
: nothing}
</summary>
<div class="chat-tool-msg-body">
${renderMessageImages(images)}
${renderMessageImages(images)} ${renderMessageAudio(audioClips)}
${reasoningMarkdown
? html`<div class="chat-thinking">
${unsafeHTML(toSanitizedMarkdownHtml(reasoningMarkdown))}
@@ -771,7 +822,7 @@ function renderGroupedMessage(
</details>
`
: html`
${renderMessageImages(images)}
${renderMessageImages(images)} ${renderMessageAudio(audioClips)}
${reasoningMarkdown
? html`<div class="chat-thinking">
${unsafeHTML(toSanitizedMarkdownHtml(reasoningMarkdown))}