diff --git a/CHANGELOG.md b/CHANGELOG.md
index 90127a318c3..0ff9def884d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
 
 - Gateway/startup: keep hot Gateway boot paths on leaf config imports and add max-RSS reporting to the gateway startup bench so low-memory startup regressions are visible before release. Thanks @vincentkoc.
+- WebChat/TTS: persist automatic final-mode TTS audio as a supplemental audio-only transcript update instead of adding a second assistant message with the same visible text. Fixes #72830. Thanks @lhtpluto.
 - Agents/LSP: terminate bundled stdio LSP process trees during runtime disposal and Gateway shutdown, so nested children such as `tsserver` do not survive stop or restart. Fixes #72357. Thanks @ai-hpc and @bittoby.
 - Diagnostics/OTEL: capture privacy-safe model-call request payload bytes, streamed response bytes, first-response latency, and total duration in diagnostic events, plugin hooks, stability snapshots, and OTEL model-call spans/metrics without logging raw model content. Fixes #33832. Thanks @wwh830.
 - Logging: write validated diagnostic trace context as top-level `traceId`, `spanId`, `parentSpanId`, and `traceFlags` fields in file-log JSONL records so traced requests and model calls are easier to correlate in log processors. Refs #40353. Thanks @liangruochong44-ui.
diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts
index 96c8fa10d46..d998dd56dde 100644
--- a/extensions/speech-core/src/tts.test.ts
+++ b/extensions/speech-core/src/tts.test.ts
@@ -156,6 +156,7 @@ async function expectTtsPayloadResult(params: {
     expect(synthesizeMock).toHaveBeenCalledWith(expect.objectContaining({ target: params.target }));
     expect(result.audioAsVoice).toBe(params.audioAsVoice);
     expect(result.mediaUrl).toMatch(new RegExp(`voice-\\d+\\.${params.mediaExtension ?? "ogg"}$`));
+    expect(result.spokenText).toBe(params.text);
     mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
   } finally {
diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts
index 9fbbc4db811..2a3a883ecb4 100644
--- a/extensions/speech-core/src/tts.ts
+++ b/extensions/speech-core/src/tts.ts
@@ -1591,6 +1591,7 @@ export async function maybeApplyTtsToPayload(params: {
     ...nextPayload,
     mediaUrl: result.audioPath,
     audioAsVoice: result.audioAsVoice || params.payload.audioAsVoice,
+    spokenText: textForAudio,
   };
 }
diff --git a/src/gateway/server-methods/chat.directive-tags.test.ts b/src/gateway/server-methods/chat.directive-tags.test.ts
index e7dacc03d0a..41154f2063c 100644
--- a/src/gateway/server-methods/chat.directive-tags.test.ts
+++ b/src/gateway/server-methods/chat.directive-tags.test.ts
@@ -23,6 +23,10 @@ const mockState = vi.hoisted(() => ({
   finalPayload: null as {
     text?: string;
     mediaUrl?: string;
+    mediaUrls?: string[];
+    spokenText?: string;
+    audioAsVoice?: boolean;
+    trustedLocalMedia?: boolean;
     sensitiveMedia?: boolean;
     replyToId?: string;
     replyToCurrent?: boolean;
@@ -34,6 +38,8 @@
     text?: string;
     mediaUrl?: string;
     mediaUrls?: string[];
+    spokenText?: string;
+    audioAsVoice?: boolean;
     trustedLocalMedia?: boolean;
     replyToId?: string;
     replyToCurrent?: boolean;
@@ -113,6 +119,10 @@ vi.mock("../../auto-reply/dispatch.js", () => ({
   sendFinalReply: (payload: {
     text?: string;
     mediaUrl?: string;
+    mediaUrls?: string[];
+    spokenText?: string;
+    audioAsVoice?: boolean;
+    trustedLocalMedia?: boolean;
     sensitiveMedia?: boolean;
     replyToId?: string;
     replyToCurrent?: boolean;
@@ -122,6 +132,8 @@
     text?: string;
     mediaUrl?: string;
     mediaUrls?: string[];
+    spokenText?: string;
+    audioAsVoice?: boolean;
     trustedLocalMedia?: boolean;
     replyToId?: string;
     replyToCurrent?: boolean;
@@ -131,6 +143,8 @@
     text?: string;
     mediaUrl?: string;
     mediaUrls?: string[];
+    spokenText?: string;
+    audioAsVoice?: boolean;
     trustedLocalMedia?: boolean;
     replyToId?: string;
     replyToCurrent?: boolean;
@@ -257,6 +271,7 @@ function createTranscriptFixture(prefix: string) {
     "utf-8",
   );
   mockState.transcriptPath = transcriptPath;
+  return dir;
 }
 
 function extractFirstTextBlock(payload: unknown): string | undefined {
@@ -579,6 +594,121 @@ describe("chat directive tag stripping for non-streaming final payloads", () => {
     });
   });
 
+  it("persists auto-TTS final media as audio-only so webchat does not duplicate assistant text", async () => {
+    const transcriptDir = createTranscriptFixture("openclaw-chat-send-agent-tts-final-");
+    const audioPath = path.join(transcriptDir, "tts.mp3");
+    fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
+    mockState.config = {
+      agents: {
+        defaults: {
+          workspace: transcriptDir,
+        },
+      },
+    };
+    mockState.triggerAgentRunStart = true;
+    mockState.dispatchedReplies = [
+      {
+        kind: "final",
+        payload: {
+          text: "This text is already in the model transcript.",
+          spokenText: "This text is already in the model transcript.",
+          mediaUrl: audioPath,
+          mediaUrls: [audioPath],
+          trustedLocalMedia: true,
+          audioAsVoice: true,
+        },
+      },
+    ];
+    const respond = vi.fn();
+    const context = createChatContext();
+
+    await runNonStreamingChatSend({
+      context,
+      respond,
+      idempotencyKey: "idem-agent-tts",
+      expectBroadcast: false,
+      waitFor: "dedupe",
+    });
+
+    const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
+      (update) =>
+        typeof update.message === "object" &&
+        update.message !== null &&
+        (update.message as { role?: unknown }).role === "assistant",
+    );
+    expect(assistantUpdates).toHaveLength(1);
+    expect(assistantUpdates[0]).toMatchObject({
+      message: {
+        role: "assistant",
+        idempotencyKey: "idem-agent-tts:assistant-media",
+        content: [
+          { type: "text", text: "Audio reply" },
+          {
+            type: "audio",
+            source: {
+              type: "base64",
+              media_type: "audio/mpeg",
+            },
+          },
+        ],
+      },
+    });
+    expect(JSON.stringify(assistantUpdates[0]?.message)).not.toContain(
+      "This text is already in the model transcript.",
+    );
+  });
+
+  it("keeps visible text on non-agent TTS final media because no model transcript exists", async () => {
+    const transcriptDir = createTranscriptFixture("openclaw-chat-send-command-tts-final-");
+    const audioPath = path.join(transcriptDir, "tts.mp3");
+    fs.writeFileSync(audioPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
+    mockState.config = {
+      agents: {
+        defaults: {
+          workspace: transcriptDir,
+        },
+      },
+    };
+    mockState.finalPayload = {
+      text: "Command result with TTS.",
+      spokenText: "Command result with TTS.",
+      mediaUrl: audioPath,
+      mediaUrls: [audioPath],
+      trustedLocalMedia: true,
+      audioAsVoice: true,
+    };
+    const respond = vi.fn();
+    const context = createChatContext();
+
+    const payload = await runNonStreamingChatSend({
+      context,
+      respond,
+      idempotencyKey: "idem-command-tts",
+    });
+
+    expect(payload?.message).toMatchObject({
+      role: "assistant",
+      content: [
+        { type: "text", text: "Command result with TTS." },
+        {
+          type: "audio",
+          source: {
+            type: "base64",
+            media_type: "audio/mpeg",
+          },
+        },
+      ],
+    });
+    const assistantUpdates = mockState.emittedTranscriptUpdates.filter(
+      (update) =>
+        typeof update.message === "object" &&
+        update.message !== null &&
+        (update.message as { role?: unknown }).role === "assistant",
+    );
+    expect(assistantUpdates).toHaveLength(1);
+    expect(JSON.stringify(assistantUpdates[0]?.message)).toContain("Command result with TTS.");
+  });
+
   it("renders image reply payloads as assistant image content instead of MEDIA text", async () => {
     createTranscriptFixture("openclaw-chat-send-agent-image-");
     mockState.finalPayload = {
diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts
index eca437d3edd..df2f3698e85 100644
--- a/src/gateway/server-methods/chat.ts
+++ b/src/gateway/server-methods/chat.ts
@@ -143,6 +143,18 @@ function isMediaBearingPayload(payload: ReplyPayload): boolean {
   return false;
 }
 
+function isTtsSupplementPayload(payload: ReplyPayload): boolean {
+  return (
+    typeof payload.spokenText === "string" &&
+    payload.spokenText.trim().length > 0 &&
+    isMediaBearingPayload(payload)
+  );
+}
+
+function stripVisibleTextFromTtsSupplement(payload: ReplyPayload): ReplyPayload {
+  return isTtsSupplementPayload(payload) ? { ...payload, text: undefined } : payload;
+}
+
 async function buildWebchatAssistantMediaMessage(
   payloads: ReplyPayload[],
   options?: {
@@ -2008,6 +2020,7 @@ export const chatHandlers: GatewayRequestHandlers = {
       if (!agentRunStarted || appendedWebchatAgentMedia || !isMediaBearingPayload(payload)) {
        return;
      }
+      const transcriptPayload = stripVisibleTextFromTtsSupplement(payload);
       const { storePath: latestStorePath, entry: latestEntry } = loadSessionEntry(sessionKey);
       const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
       const resolvedTranscriptPath = resolveTranscriptPath({
@@ -2022,9 +2035,9 @@
       );
       const assistantContent = await buildAssistantDisplayContentFromReplyPayloads({
         sessionKey,
-        payloads: [payload],
+        payloads: [transcriptPayload],
         managedImageLocalRoots: mediaLocalRoots,
-        includeSensitiveMedia: payload.sensitiveMedia !== true,
+        includeSensitiveMedia: transcriptPayload.sensitiveMedia !== true,
         onLocalAudioAccessDenied: (message) => {
           context.logGateway.warn(`webchat audio embedding denied local path: ${message}`);
         },
@@ -2032,7 +2045,7 @@
           context.logGateway.warn(`webchat image embedding skipped attachment: ${message}`);
         },
       });
-      const mediaMessage = await buildWebchatAssistantMediaMessage([payload], {
+      const mediaMessage = await buildWebchatAssistantMediaMessage([transcriptPayload], {
         localRoots: mediaLocalRoots,
         onLocalAudioAccessDenied: (message) => {
           context.logGateway.warn(`webchat audio embedding denied local path: ${message}`);
@@ -2048,7 +2061,7 @@
       const transcriptReply =
         mediaMessage?.transcriptText ??
         extractAssistantDisplayTextFromContent(assistantContent) ??
-        buildTranscriptReplyText([payload]);
+        buildTranscriptReplyText([transcriptPayload]);
       if (!transcriptReply && !persistedAssistantContent?.length && !assistantContent?.length) {
         return;
       }
@@ -2176,9 +2189,11 @@
         sessionKey,
       });
     } else {
-      const finalPayloads = deliveredReplies
-        .filter((entry) => entry.kind === "final")
-        .map((entry) => entry.payload);
+      const finalPayloads = appendedWebchatAgentMedia
+        ? []
+        : deliveredReplies
+            .filter((entry) => entry.kind === "final")
+            .map((entry) => entry.payload);
       const { storePath: latestStorePath, entry: latestEntry } = loadSessionEntry(sessionKey);
       const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
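
Note: the sketch below is illustrative and not part of the patch. It restates the two new chat.ts helpers against a reduced ReplyPayload so the stripping rule can be checked in isolation. The ReplyPayload type and isMediaBearingPayload here are simplified stand-ins for the real definitions in src/gateway/server-methods/chat.ts, which carry more fields and accept more media shapes.

type ReplyPayload = {
  text?: string;
  spokenText?: string;
  mediaUrl?: string;
  mediaUrls?: string[];
  audioAsVoice?: boolean;
  sensitiveMedia?: boolean;
};

// Simplified stand-in: the real check in chat.ts recognizes more media fields.
function isMediaBearingPayload(payload: ReplyPayload): boolean {
  return Boolean(payload.mediaUrl) || (payload.mediaUrls?.length ?? 0) > 0;
}

// A payload counts as a TTS supplement when auto-TTS stamped it with the
// synthesized text (spokenText, set by maybeApplyTtsToPayload in tts.ts above)
// and it actually carries media.
function isTtsSupplementPayload(payload: ReplyPayload): boolean {
  return (
    typeof payload.spokenText === "string" &&
    payload.spokenText.trim().length > 0 &&
    isMediaBearingPayload(payload)
  );
}

// Drop only the visible text: the assistant's text already exists in the model
// transcript, so persisting it again would duplicate the webchat message.
// Everything else (audio, voice flag, sensitivity) passes through unchanged.
function stripVisibleTextFromTtsSupplement(payload: ReplyPayload): ReplyPayload {
  return isTtsSupplementPayload(payload) ? { ...payload, text: undefined } : payload;
}

// Auto-TTS supplement: visible text stripped, audio and spokenText kept.
console.log(
  stripVisibleTextFromTtsSupplement({
    text: "Already in the transcript.",
    spokenText: "Already in the transcript.",
    mediaUrl: "/tmp/voice-1.mp3",
    audioAsVoice: true,
  }).text,
); // undefined

// Ordinary media reply without spokenText: left untouched.
console.log(
  stripVisibleTextFromTtsSupplement({ text: "A chart.", mediaUrl: "/tmp/chart.png" }).text,
); // "A chart."

The stripping only runs on the agent-run persistence path (guarded by agentRunStarted in the real handler), which matches the second test's expectation: a non-agent command reply has no model transcript to duplicate, so text plus audio remains the correct webchat message there.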