From 5b59079fd4c278b1a3eda1fa137ce5f9aeb41631 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 06:27:00 +0100 Subject: [PATCH] fix(tts): preserve audio-only hook transcript --- CHANGELOG.md | 1 + docs/plugins/hooks.md | 5 ++ src/auto-reply/reply-payload.ts | 5 ++ src/auto-reply/reply/commands-tts.ts | 1 + src/auto-reply/reply/dispatch-acp.ts | 1 + src/auto-reply/reply/dispatch-from-config.ts | 4 +- src/infra/outbound/deliver.test.ts | 62 ++++++++++++++++++++ src/infra/outbound/deliver.ts | 26 ++++++-- src/infra/outbound/payloads.test.ts | 36 ++++++++++++ src/infra/outbound/payloads.ts | 4 ++ 10 files changed, 138 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5c29a04be0..30646cbcea3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -82,6 +82,7 @@ Docs: https://docs.openclaw.ai - Discord/cron: deliver text-only isolated cron and heartbeat announce output from the canonical final assistant text once, avoiding duplicate Discord posts when streamed block payloads and the final answer contain the same content. Fixes #71406. Thanks @alexgross21. - macOS Gateway: wait for launchd to reload the exited Gateway LaunchAgent before bootstrapping repair fallback, preventing config-triggered restarts from leaving the service not loaded. Fixes #45178. Thanks @vincentkoc. +- TTS/hooks: preserve audio-only TTS transcripts for `message_sending` and `message_sent` hooks without rendering the transcript as a media caption. Thanks @zqchris. - Control UI/WebChat: hide heartbeat prompts, `HEARTBEAT_OK` acknowledgments, and internal-only runtime context turns from visible chat history while leaving the underlying transcript intact. Fixes #71381. Thanks @gerald1950ggg-ai. - Control UI/chat: keep optimistic user and assistant tail messages visible when a final history refresh briefly returns an older snapshot, preventing message cards from flash-disappearing until the next refresh. Fixes #71371. Thanks @WolvenRA. - Talk/TTS: resolve configured extension speech providers from the active runtime registry before provider-list discovery, so Talk mode no longer rejects valid plugin speech providers as unsupported. diff --git a/docs/plugins/hooks.md b/docs/plugins/hooks.md index 48a9aaa8a54..8475f42211d 100644 --- a/docs/plugins/hooks.md +++ b/docs/plugins/hooks.md @@ -190,6 +190,11 @@ Use message hooks for channel-level routing and delivery policy: - `message_sending`: rewrite `content` or return `{ cancel: true }`. - `message_sent`: observe final success or failure. +For audio-only TTS replies, `content` may contain the hidden spoken transcript +even when the channel payload has no visible text/caption. Rewriting that +`content` updates the hook-visible transcript only; it is not rendered as a +media caption. + Message hook contexts expose stable correlation fields when available: `ctx.sessionKey`, `ctx.runId`, `ctx.messageId`, `ctx.senderId`, `ctx.trace`, `ctx.traceId`, `ctx.spanId`, `ctx.parentSpanId`, and `ctx.callDepth`. Prefer diff --git a/src/auto-reply/reply-payload.ts b/src/auto-reply/reply-payload.ts index 0aacb840371..dbb34451e18 100644 --- a/src/auto-reply/reply-payload.ts +++ b/src/auto-reply/reply-payload.ts @@ -27,6 +27,11 @@ export type ReplyPayload = { replyToCurrent?: boolean; /** Send audio as voice message (bubble) instead of audio file. Defaults to false. */ audioAsVoice?: boolean; + /** + * Text synthesized into an audio-only TTS payload. Exposed to hooks for + * archival/search use when no visible channel text is sent. + */ + spokenText?: string; isError?: boolean; /** Marks this payload as a reasoning/thinking block. Channels that do not * have a dedicated reasoning lane (e.g. WhatsApp, web) should suppress it. */ diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index b6567857bd1..5dba5527f82 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -168,6 +168,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand mediaUrl: result.audioPath, audioAsVoice: result.voiceCompatible === true, trustedLocalMedia: true, + spokenText: args, }; return { shouldContinue: false, reply: payload }; } diff --git a/src/auto-reply/reply/dispatch-acp.ts b/src/auto-reply/reply/dispatch-acp.ts index 9db3987ddd1..b7dafe94a1f 100644 --- a/src/auto-reply/reply/dispatch-acp.ts +++ b/src/auto-reply/reply/dispatch-acp.ts @@ -213,6 +213,7 @@ async function finalizeAcpTurnOutput(params: { const delivered = await params.delivery.deliver("final", { mediaUrl: ttsSyntheticReply.mediaUrl, audioAsVoice: ttsSyntheticReply.audioAsVoice, + spokenText: accumulatedBlockText, }); queuedFinal = queuedFinal || delivered; finalMediaDelivered = delivered; diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index cb49054cae3..01d3ab2bd28 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -1149,10 +1149,12 @@ export async function dispatchReplyFromConfig( }); // Only send if TTS was actually applied (mediaUrl exists) if (ttsSyntheticReply.mediaUrl) { - // Send TTS-only payload (no text, just audio) so it doesn't duplicate the block content + // Send TTS-only payload (no text, just audio) so it doesn't duplicate the block content. + // Keep the spoken text only for hooks/archive consumers. const ttsOnlyPayload: ReplyPayload = { mediaUrl: ttsSyntheticReply.mediaUrl, audioAsVoice: ttsSyntheticReply.audioAsVoice, + spokenText: accumulatedBlockText, }; const result = await routeReplyToOriginating(ttsOnlyPayload); if (result) { diff --git a/src/infra/outbound/deliver.test.ts b/src/infra/outbound/deliver.test.ts index 0110441bddc..990539e656e 100644 --- a/src/infra/outbound/deliver.test.ts +++ b/src/infra/outbound/deliver.test.ts @@ -850,6 +850,68 @@ describe("deliverOutboundPayloads", () => { ); }); + it("exposes audio-only spokenText to hooks without rendering it as media caption", async () => { + hookMocks.runner.hasHooks.mockReturnValue(true); + hookMocks.runner.runMessageSending.mockResolvedValue({ + content: "rewritten hidden transcript", + }); + const sendMedia = vi.fn(async () => ({ + channel: "matrix" as const, + messageId: "mx-voice", + roomId: "!room:example", + })); + setActivePluginRegistry( + createTestRegistry([ + { + pluginId: "matrix", + source: "test", + plugin: createOutboundTestPlugin({ + id: "matrix", + outbound: { + deliveryMode: "direct", + sendText: vi.fn(), + sendMedia, + }, + }), + }, + ]), + ); + + await deliverOutboundPayloads({ + cfg: { channels: { matrix: {} } } as OpenClawConfig, + channel: "matrix", + to: "room:!room:example", + payloads: [ + { + mediaUrl: "file:///tmp/clip.opus", + audioAsVoice: true, + spokenText: "original hidden transcript", + }, + ], + }); + + expect(hookMocks.runner.runMessageSending).toHaveBeenCalledWith( + expect.objectContaining({ + content: "original hidden transcript", + }), + expect.objectContaining({ channelId: "matrix" }), + ); + expect(sendMedia).toHaveBeenCalledWith( + expect.objectContaining({ + text: "", + mediaUrl: "file:///tmp/clip.opus", + audioAsVoice: true, + }), + ); + expect(hookMocks.runner.runMessageSent).toHaveBeenCalledWith( + expect.objectContaining({ + content: "rewritten hidden transcript", + success: true, + }), + expect.objectContaining({ channelId: "matrix" }), + ); + }); + it("chunks plugin text and returns all results", async () => { const { sendMatrix, results } = await runChunkedMatrixDelivery(); diff --git a/src/infra/outbound/deliver.ts b/src/infra/outbound/deliver.ts index 7f1aeb8323e..5049d8afafe 100644 --- a/src/infra/outbound/deliver.ts +++ b/src/infra/outbound/deliver.ts @@ -620,7 +620,7 @@ async function applyMessageSendingHook(params: { const sendingResult = await params.hookRunner!.runMessageSending( { to: params.to, - content: params.payloadSummary.text, + content: params.payloadSummary.hookContent ?? params.payloadSummary.text, replyToId: params.replyToId ?? undefined, threadId: params.threadId ?? undefined, metadata: { @@ -649,6 +649,20 @@ async function applyMessageSendingHook(params: { payloadSummary: params.payloadSummary, }; } + if (params.payloadSummary.hookContent && !params.payloadSummary.text) { + const spokenText = sendingResult.content; + return { + cancelled: false, + payload: { + ...params.payload, + spokenText, + }, + payloadSummary: { + ...params.payloadSummary, + hookContent: spokenText, + }, + }; + } const payload = { ...params.payload, text: sendingResult.content, @@ -943,7 +957,7 @@ async function deliverOutboundPayloadsCore( }); emitMessageSent({ success: true, - content: payloadSummary.text, + content: payloadSummary.hookContent ?? payloadSummary.text, messageId: delivery.messageId, }); continue; @@ -977,7 +991,7 @@ async function deliverOutboundPayloadsCore( }); emitMessageSent({ success: results.length > beforeCount, - content: payloadSummary.text, + content: payloadSummary.hookContent ?? payloadSummary.text, messageId, }); continue; @@ -1017,7 +1031,7 @@ async function deliverOutboundPayloadsCore( }); emitMessageSent({ success: results.length > beforeCount, - content: payloadSummary.text, + content: payloadSummary.hookContent ?? payloadSummary.text, messageId, }); continue; @@ -1058,13 +1072,13 @@ async function deliverOutboundPayloadsCore( }); emitMessageSent({ success: true, - content: payloadSummary.text, + content: payloadSummary.hookContent ?? payloadSummary.text, messageId: lastMessageId, }); } catch (err) { emitMessageSent({ success: false, - content: payloadSummary.text, + content: payloadSummary.hookContent ?? payloadSummary.text, error: formatErrorMessage(err), }); if (!params.bestEffort) { diff --git a/src/infra/outbound/payloads.test.ts b/src/infra/outbound/payloads.test.ts index 2bb1cc862ab..cb0785ef187 100644 --- a/src/infra/outbound/payloads.test.ts +++ b/src/infra/outbound/payloads.test.ts @@ -13,6 +13,7 @@ import { projectOutboundPayloadPlanForJson, projectOutboundPayloadPlanForMirror, projectOutboundPayloadPlanForOutbound, + summarizeOutboundPayloadForTransport, } from "./payloads.js"; import { registerPendingSpawnedChildrenQuery } from "./pending-spawn-query.js"; @@ -676,3 +677,38 @@ describe("formatOutboundPayloadLog", () => { ).toBe(expected); }); }); + +describe("summarizeOutboundPayloadForTransport", () => { + it("keeps visible text as channel text and does not expose hook-only content", () => { + const summary = summarizeOutboundPayloadForTransport({ + text: "visible", + spokenText: "hidden transcript", + }); + + expect(summary.text).toBe("visible"); + expect(summary.hookContent).toBeUndefined(); + }); + + it("surfaces spokenText only as hook content for audio-only payloads", () => { + const summary = summarizeOutboundPayloadForTransport({ + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + spokenText: "Hi Ivy, good morning.", + }); + + expect(summary.text).toBe(""); + expect(summary.hookContent).toBe("Hi Ivy, good morning."); + expect(summary.mediaUrls).toEqual(["/tmp/reply.opus"]); + expect(summary.audioAsVoice).toBe(true); + }); + + it("ignores blank spokenText", () => { + const summary = summarizeOutboundPayloadForTransport({ + mediaUrl: "/tmp/reply.opus", + spokenText: " ", + }); + + expect(summary.text).toBe(""); + expect(summary.hookContent).toBeUndefined(); + }); +}); diff --git a/src/infra/outbound/payloads.ts b/src/infra/outbound/payloads.ts index fdfca6846c8..c9a0fed521b 100644 --- a/src/infra/outbound/payloads.ts +++ b/src/infra/outbound/payloads.ts @@ -31,6 +31,8 @@ export type NormalizedOutboundPayload = { delivery?: ReplyPayloadDelivery; interactive?: InteractiveReply; channelData?: Record; + /** Hook-only content for audio-only TTS payloads. Never used as channel text/caption. */ + hookContent?: string; }; export type OutboundPayloadJson = { @@ -333,6 +335,7 @@ export function summarizeOutboundPayloadForTransport( payload: ReplyPayload, ): NormalizedOutboundPayload { const parts = resolveSendableOutboundReplyParts(payload); + const spokenText = payload.spokenText?.trim() ? payload.spokenText : undefined; return { text: parts.text, mediaUrls: parts.mediaUrls, @@ -341,6 +344,7 @@ export function summarizeOutboundPayloadForTransport( delivery: payload.delivery, interactive: payload.interactive, channelData: payload.channelData, + ...(parts.text || !spokenText ? {} : { hookContent: spokenText }), }; }