From 2c8c79de5c778d25eba6a65c6fcbf38c107f42a9 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 26 Apr 2026 04:28:08 +0100 Subject: [PATCH] fix(tts): normalize streamed tts voice media --- CHANGELOG.md | 3 ++ docs/tools/tts.md | 3 ++ .../reply/dispatch-from-config.test.ts | 41 +++++++++++++++++++ src/auto-reply/reply/dispatch-from-config.ts | 5 ++- 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c5d6c7c287..d5c97f5dd14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,9 @@ Docs: https://docs.openclaw.ai - TTS: keep explicit `provider=...` directive keys scoped to that provider and warn on unsupported keys instead of letting another speech provider consume overlapping keys. Fixes #60131. +- TTS/Feishu: normalize final-mode streamed TTS-only audio before delivery so + generated voice-note files use the same safe media path and native voice + routing as normal final replies. Fixes #71920. - ACP: send subagent and async-task completion wakes to external ACP harnesses as plain prompts instead of OpenClaw internal runtime-context envelopes, while keeping those envelopes out of ACP transcripts. diff --git a/docs/tools/tts.md b/docs/tools/tts.md index e9e7c385816..98eafaedcab 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -797,6 +797,9 @@ When enabled, OpenClaw: - skips very short replies (< 10 chars). - summarizes long replies when enabled using `agents.defaults.model.primary` (or `summaryModel`). - attaches the generated audio to the reply. +- in `mode: "final"`, still sends audio-only TTS for streamed final replies + after the text stream completes; the generated media goes through the same + channel media normalization as normal reply attachments. 
If the reply exceeds `maxLength` and summary is off (or no API key for the summary model), audio diff --git a/src/auto-reply/reply/dispatch-from-config.test.ts b/src/auto-reply/reply/dispatch-from-config.test.ts index 0209b49753f..ec58c5c9888 100644 --- a/src/auto-reply/reply/dispatch-from-config.test.ts +++ b/src/auto-reply/reply/dispatch-from-config.test.ts @@ -2352,6 +2352,47 @@ describe("dispatchReplyFromConfig", () => { expect(finalPayload?.text).toBeUndefined(); }); + it("normalizes accumulated block TTS-only media before final delivery", async () => { + setNoAbort(); + ttsMocks.state.synthesizeFinalAudio = true; + replyMediaPathMocks.createReplyMediaPathNormalizer.mockReturnValue( + async (payload: ReplyPayload) => ({ + ...payload, + mediaUrl: "/tmp/openclaw-media/normalized-tts.ogg", + mediaUrls: ["/tmp/openclaw-media/normalized-tts.ogg"], + }), + ); + const dispatcher = createDispatcher(); + const ctx = buildTestCtx({ + Provider: "feishu", + Surface: "feishu", + SessionKey: "agent:main:feishu:ou_user", + }); + const replyResolver = async ( + _ctx: MsgContext, + opts?: GetReplyOptions, + ): Promise<ReplyPayload | undefined> => { + await opts?.onBlockReply?.({ text: "Hello from block streaming." 
}); + return undefined; + }; + + await dispatchReplyFromConfig({ ctx, cfg: emptyConfig, dispatcher, replyResolver }); + + expect(replyMediaPathMocks.createReplyMediaPathNormalizer).toHaveBeenCalledWith( + expect.objectContaining({ + messageProvider: "feishu", + }), + ); + expect(dispatcher.sendFinalReply).toHaveBeenCalledWith( + expect.objectContaining({ + mediaUrl: "/tmp/openclaw-media/normalized-tts.ogg", + mediaUrls: ["/tmp/openclaw-media/normalized-tts.ogg"], + audioAsVoice: true, + spokenText: "Hello from block streaming.", + }), + ); + }); + it("closes oneshot ACP sessions after the turn completes", async () => { setNoAbort(); const runtime = createAcpRuntime([{ type: "done" }]); diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index 396e939ae3c..413ec418f60 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -1227,7 +1227,8 @@ export async function dispatchReplyFromConfig( audioAsVoice: ttsSyntheticReply.audioAsVoice, spokenText: accumulatedBlockTtsText, }; - const result = await routeReplyToOriginating(ttsOnlyPayload); + const normalizedTtsOnlyPayload = await normalizeReplyMediaPayload(ttsOnlyPayload); + const result = await routeReplyToOriginating(normalizedTtsOnlyPayload); if (result) { queuedFinal = result.ok || queuedFinal; if (result.ok) { @@ -1239,7 +1240,7 @@ export async function dispatchReplyFromConfig( ); } } else { - const didQueue = dispatcher.sendFinalReply(ttsOnlyPayload); + const didQueue = dispatcher.sendFinalReply(normalizedTtsOnlyPayload); queuedFinal = didQueue || queuedFinal; } }