fix(tts): normalize streamed tts voice media

This commit is contained in:
Peter Steinberger
2026-04-26 04:28:08 +01:00
parent f4e6322649
commit 2c8c79de5c
4 changed files with 50 additions and 2 deletions

View File

@@ -71,6 +71,9 @@ Docs: https://docs.openclaw.ai
- TTS: keep explicit `provider=...` directive keys scoped to that provider and
warn on unsupported keys instead of letting another speech provider consume
overlapping keys. Fixes #60131.
- TTS/Feishu: normalize streamed TTS-only audio in `mode: "final"` before
delivery, so generated voice-note files use the same safe media path and
native voice routing as normal final replies. Fixes #71920.
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
plain prompts instead of OpenClaw internal runtime-context envelopes, while
keeping those envelopes out of ACP transcripts.

View File

@@ -797,6 +797,9 @@ When enabled, OpenClaw:
- skips very short replies (< 10 chars).
- summarizes long replies when enabled using `agents.defaults.model.primary` (or `summaryModel`).
- attaches the generated audio to the reply.
- in `mode: "final"`, still sends audio-only TTS for streamed final replies
after the text stream completes; the generated media goes through the same
channel media normalization as normal reply attachments.
If the reply exceeds `maxLength` and summary is off (or no API key for the
summary model), audio

View File

@@ -2352,6 +2352,47 @@ describe("dispatchReplyFromConfig", () => {
expect(finalPayload?.text).toBeUndefined();
});
it("normalizes accumulated block TTS-only media before final delivery", async () => {
  setNoAbort();
  // Force the TTS pipeline to synthesize audio for the accumulated block text.
  ttsMocks.state.synthesizeFinalAudio = true;
  const normalizedPath = "/tmp/openclaw-media/normalized-tts.ogg";
  // Stub the media-path normalizer so its output is observable downstream.
  replyMediaPathMocks.createReplyMediaPathNormalizer.mockReturnValue(
    async (payload: ReplyPayload) => ({
      ...payload,
      mediaUrl: normalizedPath,
      mediaUrls: [normalizedPath],
    }),
  );
  const msgCtx = buildTestCtx({
    Provider: "feishu",
    Surface: "feishu",
    SessionKey: "agent:main:feishu:ou_user",
  });
  // Stream a single text block and return no final payload, so only the
  // TTS-only audio delivery path runs after the stream completes.
  const streamOnlyResolver = async (
    _ctx: MsgContext,
    opts?: GetReplyOptions,
  ): Promise<ReplyPayload | undefined> => {
    await opts?.onBlockReply?.({ text: "Hello from block streaming." });
    return undefined;
  };
  const dispatcher = createDispatcher();
  await dispatchReplyFromConfig({
    ctx: msgCtx,
    cfg: emptyConfig,
    dispatcher,
    replyResolver: streamOnlyResolver,
  });
  // The normalizer must be built for the originating provider...
  expect(replyMediaPathMocks.createReplyMediaPathNormalizer).toHaveBeenCalledWith(
    expect.objectContaining({ messageProvider: "feishu" }),
  );
  // ...and the final reply must carry the normalized media plus voice metadata.
  expect(dispatcher.sendFinalReply).toHaveBeenCalledWith(
    expect.objectContaining({
      mediaUrl: normalizedPath,
      mediaUrls: [normalizedPath],
      audioAsVoice: true,
      spokenText: "Hello from block streaming.",
    }),
  );
});
it("closes oneshot ACP sessions after the turn completes", async () => {
setNoAbort();
const runtime = createAcpRuntime([{ type: "done" }]);

View File

@@ -1227,7 +1227,8 @@ export async function dispatchReplyFromConfig(
audioAsVoice: ttsSyntheticReply.audioAsVoice,
spokenText: accumulatedBlockTtsText,
};
const result = await routeReplyToOriginating(ttsOnlyPayload);
const normalizedTtsOnlyPayload = await normalizeReplyMediaPayload(ttsOnlyPayload);
const result = await routeReplyToOriginating(normalizedTtsOnlyPayload);
if (result) {
queuedFinal = result.ok || queuedFinal;
if (result.ok) {
@@ -1239,7 +1240,7 @@ export async function dispatchReplyFromConfig(
);
}
} else {
const didQueue = dispatcher.sendFinalReply(ttsOnlyPayload);
const didQueue = dispatcher.sendFinalReply(normalizedTtsOnlyPayload);
queuedFinal = didQueue || queuedFinal;
}
}