fix(tts): keep final webchat audio supplemental

Peter Steinberger
2026-04-27 20:22:18 +01:00
parent d2b0ff808a
commit 7829c438a6
5 changed files with 155 additions and 7 deletions

View File

@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Gateway/startup: keep hot Gateway boot paths on leaf config imports and add max-RSS reporting to the gateway startup bench so low-memory startup regressions are visible before release. Thanks @vincentkoc.
- WebChat/TTS: persist automatic final-mode TTS audio as a supplemental audio-only transcript update instead of adding a second assistant message with the same visible text. Fixes #72830. Thanks @lhtpluto.
- Agents/LSP: terminate bundled stdio LSP process trees during runtime disposal and Gateway shutdown, so nested children such as `tsserver` do not survive stop or restart (a hedged sketch of the process-group pattern follows this list). Fixes #72357. Thanks @ai-hpc and @bittoby.
- Diagnostics/OTEL: capture privacy-safe model-call request payload bytes, streamed response bytes, first-response latency, and total duration in diagnostic events, plugin hooks, stability snapshots, and OTEL model-call spans/metrics without logging raw model content (see the byte-counting sketch below). Fixes #33832. Thanks @wwh830.
- Logging: write validated diagnostic trace context as top-level `traceId`, `spanId`, `parentSpanId`, and `traceFlags` fields in file-log JSONL records so traced requests and model calls are easier to correlate in log processors (see the JSONL sketch below). Refs #40353. Thanks @liangruochong44-ui.
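
For the Agents/LSP entry, the usual Node.js way to kill a whole process tree on POSIX is to spawn each LSP server as a process-group leader and signal the negative pid on shutdown. This is a hedged sketch of that pattern, not the actual OpenClaw code; `spawnLspServer` and `terminateProcessTree` are invented names, and Windows would need a different mechanism (e.g. `taskkill /T`).

import { spawn, type ChildProcess } from "node:child_process";

// detached: true makes the child a process-group leader on POSIX, so nested
// children such as tsserver inherit its group id.
function spawnLspServer(command: string, args: string[]): ChildProcess {
  return spawn(command, args, { stdio: "pipe", detached: true });
}

function terminateProcessTree(child: ChildProcess): void {
  if (child.pid === undefined) return;
  try {
    // Signalling a negative pid targets the entire process group.
    process.kill(-child.pid, "SIGTERM");
  } catch {
    child.kill("SIGTERM"); // group already gone; fall back to the leader
  }
}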
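
For the Diagnostics/OTEL entry, streamed byte counts can be captured with a pass-through stream that never buffers or logs the content it measures. A minimal sketch under that assumption; `createByteCounter` is an illustrative name, not the commit's API.

import { Transform } from "node:stream";

// Counts bytes flowing through without retaining or logging the data itself.
function createByteCounter(onDone: (bytes: number) => void): Transform {
  let bytes = 0;
  return new Transform({
    transform(chunk, _encoding, callback) {
      bytes += (chunk as Buffer).length;
      callback(null, chunk); // pass the chunk through untouched
    },
    flush(callback) {
      onDone(bytes);
      callback();
    },
  });
}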
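
And for the Logging entry, the record shape being described is trace context promoted to top-level JSONL fields rather than nested under a metadata object. A hedged illustration with invented values (the IDs follow the W3C traceparent format):

// One record per line; top-level trace fields are easy for log processors
// to index without unnesting.
console.log(
  JSON.stringify({
    level: "info",
    msg: "model call completed",
    traceId: "0af7651916cd43dd8448eb211c80319c",
    spanId: "b7ad6b7169203331",
    parentSpanId: "00f067aa0ba902b7",
    traceFlags: "01",
  }),
);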

View File

@@ -156,6 +156,7 @@ async function expectTtsPayloadResult(params: {
    expect(synthesizeMock).toHaveBeenCalledWith(expect.objectContaining({ target: params.target }));
    expect(result.audioAsVoice).toBe(params.audioAsVoice);
    expect(result.mediaUrl).toMatch(new RegExp(`voice-\\d+\\.${params.mediaExtension ?? "ogg"}$`));
    expect(result.spokenText).toBe(params.text);
    mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
  } finally {

View File

@@ -1591,6 +1591,7 @@ export async function maybeApplyTtsToPayload(params: {
      ...nextPayload,
      mediaUrl: result.audioPath,
      audioAsVoice: result.audioAsVoice || params.payload.audioAsVoice,
      spokenText: textForAudio,
    };
  }

View File

@@ -23,6 +23,10 @@ const mockState = vi.hoisted(() => ({
  finalPayload: null as {
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    sensitiveMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -34,6 +38,8 @@ const mockState = vi.hoisted(() => ({
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -113,6 +119,10 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
  sendFinalReply: (payload: {
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    sensitiveMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -122,6 +132,8 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -131,6 +143,8 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
    text?: string;
    mediaUrl?: string;
    mediaUrls?: string[];
    spokenText?: string;
    audioAsVoice?: boolean;
    trustedLocalMedia?: boolean;
    replyToId?: string;
    replyToCurrent?: boolean;
@@ -257,6 +271,7 @@ function createTranscriptFixture(prefix: string) {
    "utf-8",
  );
  mockState.transcriptPath = transcriptPath;
  return dir;
}
function extractFirstTextBlock(payload: unknown): string | undefined {
@@ -579,6 +594,121 @@ describe("chat directive tag stripping for non-streaming final payloads", () =>
  });
});

it("persists auto-TTS final media as audio-only so webchat does not duplicate assistant text", async () => {
  const transcriptDir = createTranscriptFixture("openclaw-chat-send-agent-tts-final-");
  const audioPath = path.join(transcriptDir, "tts.mp3");
  fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
  mockState.config = {
    agents: {
      defaults: {
        workspace: transcriptDir,
      },
    },
  };
  mockState.triggerAgentRunStart = true;
  mockState.dispatchedReplies = [
    {
      kind: "final",
      payload: {
        text: "This text is already in the model transcript.",
        spokenText: "This text is already in the model transcript.",
        mediaUrl: audioPath,
        mediaUrls: [audioPath],
        trustedLocalMedia: true,
        audioAsVoice: true,
      },
    },
  ];
  const respond = vi.fn();
  const context = createChatContext();
  await runNonStreamingChatSend({
    context,
    respond,
    idempotencyKey: "idem-agent-tts",
    expectBroadcast: false,
    waitFor: "dedupe",
  });
  const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
    (update) =>
      typeof update.message === "object" &&
      update.message !== null &&
      (update.message as { role?: unknown }).role === "assistant",
  );
  expect(assistantUpdates).toHaveLength(1);
  expect(assistantUpdates[0]).toMatchObject({
    message: {
      role: "assistant",
      idempotencyKey: "idem-agent-tts:assistant-media",
      content: [
        { type: "text", text: "Audio reply" },
        {
          type: "audio",
          source: {
            type: "base64",
            media_type: "audio/mpeg",
          },
        },
      ],
    },
  });
  expect(JSON.stringify(assistantUpdates[0]?.message)).not.toContain(
    "This text is already in the model transcript.",
  );
});

it("keeps visible text on non-agent TTS final media because no model transcript exists", async () => {
  const transcriptDir = createTranscriptFixture("openclaw-chat-send-command-tts-final-");
  const audioPath = path.join(transcriptDir, "tts.mp3");
  fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
  mockState.config = {
    agents: {
      defaults: {
        workspace: transcriptDir,
      },
    },
  };
  mockState.finalPayload = {
    text: "Command result with TTS.",
    spokenText: "Command result with TTS.",
    mediaUrl: audioPath,
    mediaUrls: [audioPath],
    trustedLocalMedia: true,
    audioAsVoice: true,
  };
  const respond = vi.fn();
  const context = createChatContext();
  const payload = await runNonStreamingChatSend({
    context,
    respond,
    idempotencyKey: "idem-command-tts",
  });
  expect(payload?.message).toMatchObject({
    role: "assistant",
    content: [
      { type: "text", text: "Command result with TTS." },
      {
        type: "audio",
        source: {
          type: "base64",
          media_type: "audio/mpeg",
        },
      },
    ],
  });
  const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
    (update) =>
      typeof update.message === "object" &&
      update.message !== null &&
      (update.message as { role?: unknown }).role === "assistant",
  );
  expect(assistantUpdates).toHaveLength(1);
  expect(JSON.stringify(assistantUpdates[0]?.message)).toContain("Command result with TTS.");
});

it("renders image reply payloads as assistant image content instead of MEDIA text", async () => {
  createTranscriptFixture("openclaw-chat-send-agent-image-");
  mockState.finalPayload = {

View File

@@ -143,6 +143,18 @@ function isMediaBearingPayload(payload: ReplyPayload): boolean {
  return false;
}

// A TTS supplement is a payload whose audio merely re-voices text the model
// already produced: it has a non-empty spokenText and carries media.
function isTtsSupplementPayload(payload: ReplyPayload): boolean {
  return (
    typeof payload.spokenText === "string" &&
    payload.spokenText.trim().length > 0 &&
    isMediaBearingPayload(payload)
  );
}

// Drop the visible text from a TTS supplement so persisting it does not
// duplicate the assistant message that already holds the same text.
function stripVisibleTextFromTtsSupplement(payload: ReplyPayload): ReplyPayload {
  return isTtsSupplementPayload(payload) ? { ...payload, text: undefined } : payload;
}

async function buildWebchatAssistantMediaMessage(
  payloads: ReplyPayload[],
  options?: {
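
Taken together, these two helpers classify a final payload that carries both a non-empty `spokenText` and media as a pure audio supplement, and blank its visible text before transcript persistence. A minimal usage sketch, with the payload literal invented for illustration (and assuming `mediaUrl` satisfies `isMediaBearingPayload`):

const payload = {
  text: "This text is already in the model transcript.",
  spokenText: "This text is already in the model transcript.",
  mediaUrl: "/tmp/voice-1.mp3",
  audioAsVoice: true,
};

// isTtsSupplementPayload(payload) is true, so the transcript update keeps
// the audio while dropping the duplicate visible text.
const transcriptPayload = stripVisibleTextFromTtsSupplement(payload);
// transcriptPayload.text === undefined; all other fields are unchanged.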
@@ -2008,6 +2020,7 @@ export const chatHandlers: GatewayRequestHandlers = {
      if (!agentRunStarted || appendedWebchatAgentMedia || !isMediaBearingPayload(payload)) {
        return;
      }
      const transcriptPayload = stripVisibleTextFromTtsSupplement(payload);
      const { storePath: latestStorePath, entry: latestEntry } = loadSessionEntry(sessionKey);
      const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
      const resolvedTranscriptPath = resolveTranscriptPath({
@@ -2022,9 +2035,9 @@ export const chatHandlers: GatewayRequestHandlers = {
      );
      const assistantContent = await buildAssistantDisplayContentFromReplyPayloads({
        sessionKey,
        payloads: [payload],
        payloads: [transcriptPayload],
        managedImageLocalRoots: mediaLocalRoots,
        includeSensitiveMedia: payload.sensitiveMedia !== true,
        includeSensitiveMedia: transcriptPayload.sensitiveMedia !== true,
        onLocalAudioAccessDenied: (message) => {
          context.logGateway.warn(`webchat audio embedding denied local path: ${message}`);
        },
@@ -2032,7 +2045,7 @@ export const chatHandlers: GatewayRequestHandlers = {
          context.logGateway.warn(`webchat image embedding skipped attachment: ${message}`);
        },
      });
      const mediaMessage = await buildWebchatAssistantMediaMessage([payload], {
      const mediaMessage = await buildWebchatAssistantMediaMessage([transcriptPayload], {
        localRoots: mediaLocalRoots,
        onLocalAudioAccessDenied: (message) => {
          context.logGateway.warn(`webchat audio embedding denied local path: ${message}`);
@@ -2048,7 +2061,7 @@ export const chatHandlers: GatewayRequestHandlers = {
      const transcriptReply =
        mediaMessage?.transcriptText ??
        extractAssistantDisplayTextFromContent(assistantContent) ??
        buildTranscriptReplyText([payload]);
        buildTranscriptReplyText([transcriptPayload]);
      if (!transcriptReply && !persistedAssistantContent?.length && !assistantContent?.length) {
        return;
      }
@@ -2176,9 +2189,11 @@ export const chatHandlers: GatewayRequestHandlers = {
            sessionKey,
          });
        } else {
          const finalPayloads = deliveredReplies
            .filter((entry) => entry.kind === "final")
            .map((entry) => entry.payload);
          const finalPayloads = appendedWebchatAgentMedia
            ? []
            : deliveredReplies
                .filter((entry) => entry.kind === "final")
                .map((entry) => entry.payload);
          const { storePath: latestStorePath, entry: latestEntry } =
            loadSessionEntry(sessionKey);
          const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;