From eec18fccb4ef6474eec4933217192d00ba303ddd Mon Sep 17 00:00:00 2001 From: wuyangfan <1102042793@qq.com> Date: Sun, 17 May 2026 02:54:39 +0800 Subject: [PATCH] fix(webchat): forward trustedLocalMedia on accumulated block TTS tail Avoid per-block final-mode synthesis (it duplicates the audio the dispatch tail already synthesizes). Mark TTS output as trusted local media and pass the flag through the TTS-only final payload WebChat consumes after block streaming. Fixes #82628 Co-authored-by: Cursor --- extensions/speech-core/src/tts.test.ts | 16 +++-- extensions/speech-core/src/tts.ts | 2 +- scripts/repro/webchat-auto-tts-live-proof.mjs | 58 +++++++++++-------- .../reply/dispatch-from-config.test.ts | 2 + src/auto-reply/reply/dispatch-from-config.ts | 1 + 5 files changed, 48 insertions(+), 31 deletions(-) diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index 28961dbbb4c..ba5a0290cb2 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -442,15 +442,19 @@ describe("speech-core native voice-note routing", () => { } }); - it("applies TTS for block delivery kind in final mode (#82628)", async () => { - await expectTtsPayloadResult({ + it("skips block delivery kind in final mode (accumulated final tail synthesizes instead)", async () => { + synthesizeMock.mockClear(); + const cfg = createTtsConfig("openclaw-speech-core-block-kind-tts-test"); + const result = await maybeApplyTtsToPayload({ + payload: { text: "WebChat block stream chunks defer TTS to the final tail."
}, + cfg, channel: "webchat", - prefsName: "openclaw-speech-core-block-kind-tts-test", - text: "WebChat block replies should synthesize audio for auto TTS.", - target: "audio-file", - audioAsVoice: undefined, kind: "block", }); + + expect(synthesizeMock).not.toHaveBeenCalled(); + expect(result.trustedLocalMedia).toBeUndefined(); + expect(result.text).toBe("WebChat block stream chunks defer TTS to the final tail."); }); it("skips tool delivery kind in final mode", async () => { diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 54609ec783d..0638a1caf5c 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -1759,7 +1759,7 @@ export async function maybeApplyTtsToPayload(params: { } const mode = config.mode ?? "final"; - if (mode === "final" && params.kind && params.kind !== "final" && params.kind !== "block") { + if (mode === "final" && params.kind && params.kind !== "final") { return nextPayload; } diff --git a/scripts/repro/webchat-auto-tts-live-proof.mjs b/scripts/repro/webchat-auto-tts-live-proof.mjs index 19e040caf90..a74324fd8fd 100644 --- a/scripts/repro/webchat-auto-tts-live-proof.mjs +++ b/scripts/repro/webchat-auto-tts-live-proof.mjs @@ -62,52 +62,62 @@ async function main() { }, }; - const blockText = "WebChat block replies should synthesize audio for auto TTS."; + const accumulatedBlockText = + "WebChat streams block text; dispatch synthesizes one TTS tail with kind final."; const blockResult = await maybeApplyTtsToPayload({ - payload: { text: blockText }, + payload: { text: accumulatedBlockText }, cfg, channel: "webchat", kind: "block", }); console.log("maybeApplyTtsToPayload(kind=block).mediaUrl =", blockResult.mediaUrl ?? "(none)"); - console.log( - "maybeApplyTtsToPayload(kind=block).trustedLocalMedia =", - blockResult.trustedLocalMedia ?? false, - ); - const toolResult = await maybeApplyTtsToPayload({ - payload: { text: "Intermediate tool output should not be spoken." 
}, + const tailResult = await maybeApplyTtsToPayload({ + payload: { text: accumulatedBlockText }, cfg, channel: "webchat", - kind: "tool", + kind: "final", }); - console.log("maybeApplyTtsToPayload(kind=tool).mediaUrl =", toolResult.mediaUrl ?? "(none)"); - - const mediaPath = blockResult.mediaUrl; - if (!mediaPath || !fs.existsSync(mediaPath)) { - throw new Error("expected block TTS to write a local media file"); - } - const localRoots = [path.dirname(mediaPath)]; - const trustedBlocks = await buildWebchatAudioContentBlocksFromReplyPayloads( - [{ mediaUrl: mediaPath, trustedLocalMedia: true }], - { localRoots }, + console.log("maybeApplyTtsToPayload(kind=final).mediaUrl =", tailResult.mediaUrl ?? "(none)"); + console.log( + "maybeApplyTtsToPayload(kind=final).trustedLocalMedia =", + tailResult.trustedLocalMedia ?? false, ); + + const mediaPath = tailResult.mediaUrl; + if (!mediaPath || !fs.existsSync(mediaPath)) { + throw new Error("expected final-mode tail TTS to write a local media file"); + } + + const ttsOnlyPayload = { + mediaUrl: tailResult.mediaUrl, + audioAsVoice: tailResult.audioAsVoice, + spokenText: accumulatedBlockText, + trustedLocalMedia: tailResult.trustedLocalMedia, + }; + console.log( + "dispatch ttsOnlyPayload.trustedLocalMedia =", + ttsOnlyPayload.trustedLocalMedia ?? 
false, + ); + + const localRoots = [path.dirname(mediaPath)]; + const trustedBlocks = await buildWebchatAudioContentBlocksFromReplyPayloads([ttsOnlyPayload], { + localRoots, + }); const untrustedBlocks = await buildWebchatAudioContentBlocksFromReplyPayloads( [{ mediaUrl: mediaPath }], { localRoots }, ); console.log( - "buildWebchatAudioContentBlocksFromReplyPayloads(trustedLocalMedia=true).length =", + "buildWebchatAudioContentBlocksFromReplyPayloads(ttsOnlyPayload).length =", trustedBlocks.length, ); console.log( - "buildWebchatAudioContentBlocksFromReplyPayloads(trustedLocalMedia missing).length =", + "buildWebchatAudioContentBlocksFromReplyPayloads(untrusted).length =", untrustedBlocks.length, ); - if (blockResult.mediaUrl) { - fs.rmSync(path.dirname(blockResult.mediaUrl), { recursive: true, force: true }); - } + fs.rmSync(path.dirname(mediaPath), { recursive: true, force: true }); try { fs.unlinkSync(prefsPath); } catch { diff --git a/src/auto-reply/reply/dispatch-from-config.test.ts b/src/auto-reply/reply/dispatch-from-config.test.ts index 0f09822ed8d..37aa52e98aa 100644 --- a/src/auto-reply/reply/dispatch-from-config.test.ts +++ b/src/auto-reply/reply/dispatch-from-config.test.ts @@ -153,6 +153,7 @@ const ttsMocks = vi.hoisted(() => { ...params.payload, mediaUrl: "https://example.com/tts-synth.opus", audioAsVoice: true, + trustedLocalMedia: true, }; } return params.payload; @@ -2722,6 +2723,7 @@ describe("dispatchReplyFromConfig", () => { expect(finalPayload?.mediaUrls).toStrictEqual(["/tmp/openclaw-media/normalized-tts.ogg"]); expect(finalPayload?.audioAsVoice).toBe(true); expect(finalPayload?.spokenText).toBe("Hello from block streaming."); + expect(finalPayload?.trustedLocalMedia).toBe(true); }); it("closes oneshot ACP sessions after the turn completes", async () => { diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index 54e4a2cd55a..bad231cf6d8 100644 --- 
a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -1700,6 +1700,7 @@ export async function dispatchReplyFromConfig( mediaUrl: ttsSyntheticReply.mediaUrl, audioAsVoice: ttsSyntheticReply.audioAsVoice, spokenText: accumulatedBlockTtsText, + trustedLocalMedia: ttsSyntheticReply.trustedLocalMedia, }; const normalizedTtsOnlyPayload = await normalizeReplyMediaPayload(ttsOnlyPayload); const result = await routeReplyToOriginating(normalizedTtsOnlyPayload);