fix(tts): normalize streamed tts voice media

This commit is contained in:
Peter Steinberger
2026-04-26 04:28:08 +01:00
parent f4e6322649
commit 2c8c79de5c
4 changed files with 50 additions and 2 deletions

View File

@@ -71,6 +71,9 @@ Docs: https://docs.openclaw.ai
- TTS: keep explicit `provider=...` directive keys scoped to that provider and
warn on unsupported keys instead of letting another speech provider consume
overlapping keys. Fixes #60131.
- TTS/Feishu: normalize streamed TTS-only audio in `mode: "final"` before
delivery, so generated voice-note files use the same safe media path and
native voice routing as normal final replies. Fixes #71920.
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
plain prompts instead of OpenClaw internal runtime-context envelopes, while
keeping those envelopes out of ACP transcripts.

View File

@@ -797,6 +797,9 @@ When enabled, OpenClaw:
- skips very short replies (< 10 chars).
- summarizes long replies when enabled using `agents.defaults.model.primary` (or `summaryModel`).
- attaches the generated audio to the reply.
- in `mode: "final"`, still sends audio-only TTS for streamed final replies
after the text stream completes; the generated media goes through the same
channel media normalization as normal reply attachments.
If the reply exceeds `maxLength` and summary is off (or no API key for the
summary model), audio

View File

@@ -2352,6 +2352,47 @@ describe("dispatchReplyFromConfig", () => {
expect(finalPayload?.text).toBeUndefined();
});
it("normalizes accumulated block TTS-only media before final delivery", async () => {
  setNoAbort();
  // Force the TTS pipeline to synthesize audio for the accumulated block text.
  ttsMocks.state.synthesizeFinalAudio = true;
  const normalizedPath = "/tmp/openclaw-media/normalized-tts.ogg";
  // Stub the media-path normalizer so its output is observable downstream.
  replyMediaPathMocks.createReplyMediaPathNormalizer.mockReturnValue(
    async (payload: ReplyPayload) => ({
      ...payload,
      mediaUrl: normalizedPath,
      mediaUrls: [normalizedPath],
    }),
  );
  const msgCtx = buildTestCtx({
    Provider: "feishu",
    Surface: "feishu",
    SessionKey: "agent:main:feishu:ou_user",
  });
  // Stream a single text block and return no final payload, so only the
  // TTS-only audio delivery path runs after the stream completes.
  const streamOnlyResolver = async (
    _ctx: MsgContext,
    opts?: GetReplyOptions,
  ): Promise<ReplyPayload | undefined> => {
    await opts?.onBlockReply?.({ text: "Hello from block streaming." });
    return undefined;
  };
  const dispatcher = createDispatcher();
  await dispatchReplyFromConfig({
    ctx: msgCtx,
    cfg: emptyConfig,
    dispatcher,
    replyResolver: streamOnlyResolver,
  });
  // The normalizer must be built for the originating provider...
  expect(replyMediaPathMocks.createReplyMediaPathNormalizer).toHaveBeenCalledWith(
    expect.objectContaining({ messageProvider: "feishu" }),
  );
  // ...and the final reply must carry the normalized media plus voice metadata.
  expect(dispatcher.sendFinalReply).toHaveBeenCalledWith(
    expect.objectContaining({
      mediaUrl: normalizedPath,
      mediaUrls: [normalizedPath],
      audioAsVoice: true,
      spokenText: "Hello from block streaming.",
    }),
  );
});
it("closes oneshot ACP sessions after the turn completes", async () => {
setNoAbort();
const runtime = createAcpRuntime([{ type: "done" }]);

View File

@@ -1227,7 +1227,8 @@ export async function dispatchReplyFromConfig(
audioAsVoice: ttsSyntheticReply.audioAsVoice,
spokenText: accumulatedBlockTtsText,
};
const result = await routeReplyToOriginating(ttsOnlyPayload);
const normalizedTtsOnlyPayload = await normalizeReplyMediaPayload(ttsOnlyPayload);
const result = await routeReplyToOriginating(normalizedTtsOnlyPayload);
if (result) {
queuedFinal = result.ok || queuedFinal;
if (result.ok) {
@@ -1239,7 +1240,7 @@ export async function dispatchReplyFromConfig(
);
}
} else {
const didQueue = dispatcher.sendFinalReply(ttsOnlyPayload);
const didQueue = dispatcher.sendFinalReply(normalizedTtsOnlyPayload);
queuedFinal = didQueue || queuedFinal;
}
}