fix(webchat): forward trustedLocalMedia on accumulated block TTS tail

Skip per-block TTS synthesis in final mode; it duplicated the audio tail
that dispatch synthesizes from the accumulated block text. Mark TTS output
as trusted local media and forward that flag through the TTS-only final
payload that WebChat consumes after block streaming.

Fixes #82628

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
wuyangfan
2026-05-17 02:54:39 +08:00
committed by Peter Steinberger
parent f8323f8636
commit eec18fccb4
5 changed files with 48 additions and 31 deletions

View File

@@ -442,15 +442,19 @@ describe("speech-core native voice-note routing", () => {
}
});
it("applies TTS for block delivery kind in final mode (#82628)", async () => {
await expectTtsPayloadResult({
it("skips block delivery kind in final mode (accumulated final tail synthesizes instead)", async () => {
synthesizeMock.mockClear();
const cfg = createTtsConfig("openclaw-speech-core-block-kind-tts-test");
const result = await maybeApplyTtsToPayload({
payload: { text: "WebChat block stream chunks defer TTS to the final tail." },
cfg,
channel: "webchat",
prefsName: "openclaw-speech-core-block-kind-tts-test",
text: "WebChat block replies should synthesize audio for auto TTS.",
target: "audio-file",
audioAsVoice: undefined,
kind: "block",
});
expect(synthesizeMock).not.toHaveBeenCalled();
expect(result.trustedLocalMedia).toBeUndefined();
expect(result.text).toBe("WebChat block stream chunks defer TTS to the final tail.");
});
it("skips tool delivery kind in final mode", async () => {

View File

@@ -1759,7 +1759,7 @@ export async function maybeApplyTtsToPayload(params: {
}
const mode = config.mode ?? "final";
if (mode === "final" && params.kind && params.kind !== "final" && params.kind !== "block") {
if (mode === "final" && params.kind && params.kind !== "final") {
return nextPayload;
}

View File

@@ -62,52 +62,62 @@ async function main() {
},
};
const blockText = "WebChat block replies should synthesize audio for auto TTS.";
const accumulatedBlockText =
"WebChat streams block text; dispatch synthesizes one TTS tail with kind final.";
const blockResult = await maybeApplyTtsToPayload({
payload: { text: blockText },
payload: { text: accumulatedBlockText },
cfg,
channel: "webchat",
kind: "block",
});
console.log("maybeApplyTtsToPayload(kind=block).mediaUrl =", blockResult.mediaUrl ?? "(none)");
console.log(
"maybeApplyTtsToPayload(kind=block).trustedLocalMedia =",
blockResult.trustedLocalMedia ?? false,
);
const toolResult = await maybeApplyTtsToPayload({
payload: { text: "Intermediate tool output should not be spoken." },
const tailResult = await maybeApplyTtsToPayload({
payload: { text: accumulatedBlockText },
cfg,
channel: "webchat",
kind: "tool",
kind: "final",
});
console.log("maybeApplyTtsToPayload(kind=tool).mediaUrl =", toolResult.mediaUrl ?? "(none)");
const mediaPath = blockResult.mediaUrl;
if (!mediaPath || !fs.existsSync(mediaPath)) {
throw new Error("expected block TTS to write a local media file");
}
const localRoots = [path.dirname(mediaPath)];
const trustedBlocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: mediaPath, trustedLocalMedia: true }],
{ localRoots },
console.log("maybeApplyTtsToPayload(kind=final).mediaUrl =", tailResult.mediaUrl ?? "(none)");
console.log(
"maybeApplyTtsToPayload(kind=final).trustedLocalMedia =",
tailResult.trustedLocalMedia ?? false,
);
const mediaPath = tailResult.mediaUrl;
if (!mediaPath || !fs.existsSync(mediaPath)) {
throw new Error("expected final-mode tail TTS to write a local media file");
}
const ttsOnlyPayload = {
mediaUrl: tailResult.mediaUrl,
audioAsVoice: tailResult.audioAsVoice,
spokenText: accumulatedBlockText,
trustedLocalMedia: tailResult.trustedLocalMedia,
};
console.log(
"dispatch ttsOnlyPayload.trustedLocalMedia =",
ttsOnlyPayload.trustedLocalMedia ?? false,
);
const localRoots = [path.dirname(mediaPath)];
const trustedBlocks = await buildWebchatAudioContentBlocksFromReplyPayloads([ttsOnlyPayload], {
localRoots,
});
const untrustedBlocks = await buildWebchatAudioContentBlocksFromReplyPayloads(
[{ mediaUrl: mediaPath }],
{ localRoots },
);
console.log(
"buildWebchatAudioContentBlocksFromReplyPayloads(trustedLocalMedia=true).length =",
"buildWebchatAudioContentBlocksFromReplyPayloads(ttsOnlyPayload).length =",
trustedBlocks.length,
);
console.log(
"buildWebchatAudioContentBlocksFromReplyPayloads(trustedLocalMedia missing).length =",
"buildWebchatAudioContentBlocksFromReplyPayloads(untrusted).length =",
untrustedBlocks.length,
);
if (blockResult.mediaUrl) {
fs.rmSync(path.dirname(blockResult.mediaUrl), { recursive: true, force: true });
}
fs.rmSync(path.dirname(mediaPath), { recursive: true, force: true });
try {
fs.unlinkSync(prefsPath);
} catch {

View File

@@ -153,6 +153,7 @@ const ttsMocks = vi.hoisted(() => {
...params.payload,
mediaUrl: "https://example.com/tts-synth.opus",
audioAsVoice: true,
trustedLocalMedia: true,
};
}
return params.payload;
@@ -2722,6 +2723,7 @@ describe("dispatchReplyFromConfig", () => {
expect(finalPayload?.mediaUrls).toStrictEqual(["/tmp/openclaw-media/normalized-tts.ogg"]);
expect(finalPayload?.audioAsVoice).toBe(true);
expect(finalPayload?.spokenText).toBe("Hello from block streaming.");
expect(finalPayload?.trustedLocalMedia).toBe(true);
});
it("closes oneshot ACP sessions after the turn completes", async () => {

View File

@@ -1700,6 +1700,7 @@ export async function dispatchReplyFromConfig(
mediaUrl: ttsSyntheticReply.mediaUrl,
audioAsVoice: ttsSyntheticReply.audioAsVoice,
spokenText: accumulatedBlockTtsText,
trustedLocalMedia: ttsSyntheticReply.trustedLocalMedia,
};
const normalizedTtsOnlyPayload = await normalizeReplyMediaPayload(ttsOnlyPayload);
const result = await routeReplyToOriginating(normalizedTtsOnlyPayload);