From e437763246ded44704de92bacf268d654d590ef1 Mon Sep 17 00:00:00 2001 From: Conan-Scott Date: Wed, 6 May 2026 19:37:22 +1000 Subject: [PATCH] fix(agents): deliver agent TTS audio when block streaming is off (#78355) Summary: - The branch changes non-streaming block reply delivery to direct-send all media-bearing block replies, updates reply-delivery/media-path regression tests, and adds a changelog entry. - Reproducibility: yes. Current main's predicate and unit test show captioned media-bearing block replies are buffered rather than directly sent when block streaming is disabled, and the PR body adds real Telegram after-fix proof for the TTS path. Automerge notes: - PR branch already contained follow-up commit before automerge: test(agents): align direct media block delivery coverage Validation: - ClawSweeper review passed for head e9bb1314fe9104e0ec61090c46e7edecf24499ca. - Required merge gates passed before the squash merge. Prepared head SHA: e9bb1314fe9104e0ec61090c46e7edecf24499ca Review: https://github.com/openclaw/openclaw/pull/78355#issuecomment-4386200162 Co-authored-by: Clawdbot Co-authored-by: Ayaan Zaidi --- CHANGELOG.md | 1 + .../reply/agent-runner.media-paths.test.ts | 4 +- src/auto-reply/reply/reply-delivery.test.ts | 52 +++++++++++++++++-- src/auto-reply/reply/reply-delivery.ts | 4 +- 4 files changed, 53 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a96805faa58..9fdde3d670c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -142,6 +142,7 @@ Docs: https://docs.openclaw.ai - Agents/context engines: keep hidden OpenClaw runtime-context custom messages out of context-engine assemble, afterTurn, and ingest hooks so transcript reconstruction plugins only see conversation messages. Thanks @vincentkoc. - Network/runtime: avoid importing Undici's package dispatcher during no-proxy timeout bootstrap so external channel plugin fetch requests with explicit Content-Length keep working. Fixes #78007. Thanks @shakkernerd. 
- Gateway/shutdown: cancel delayed post-ready maintenance during close and suppress maintenance/cron startup after quick restarts, preventing orphaned background timers. Thanks @vincentkoc. +- Agents/TTS: send media-bearing block replies directly when block streaming is off, so agent `tts` tool audio attached to a final text reply is delivered instead of being consumed before final Telegram/media delivery. Thanks @Conan-Scott. - Agents/generated media: treat attachment-style message tool actions as completed chat sends, preventing duplicate fallback media posts when generated files were already uploaded. - Control UI/sessions: show each session's agent runtime in the Sessions table and allow filtering by runtime labels, matching the Agents panel runtime wording. Thanks @vincentkoc. - Discord/streaming: show live reasoning text in progress drafts instead of a bare `Reasoning` status line. diff --git a/src/auto-reply/reply/agent-runner.media-paths.test.ts b/src/auto-reply/reply/agent-runner.media-paths.test.ts index acfde87d4bc..3f93e1454c2 100644 --- a/src/auto-reply/reply/agent-runner.media-paths.test.ts +++ b/src/auto-reply/reply/agent-runner.media-paths.test.ts @@ -268,7 +268,8 @@ describe("runReplyAgent media path normalization", () => { }), ); - expect(result).toMatchObject({ + expect(result).toBeUndefined(); + expect(onBlockReply).toHaveBeenCalledWith({ text: "here is the chart", mediaUrl: "/tmp/outbound-media/1-chart.png", mediaUrls: ["/tmp/outbound-media/1-chart.png"], @@ -277,7 +278,6 @@ describe("runReplyAgent media path normalization", () => { audioAsVoice: false, }); expect(resolveOutboundAttachmentFromUrlMock).toHaveBeenCalledTimes(1); - expect(onBlockReply).not.toHaveBeenCalled(); }); it("does not create a second media context inside runAgentTurnWithFallback when onBlockReply is provided", async () => { diff --git a/src/auto-reply/reply/reply-delivery.test.ts b/src/auto-reply/reply/reply-delivery.test.ts index f9811a15855..01b3ff9cd3f 100644 --- 
a/src/auto-reply/reply/reply-delivery.test.ts +++ b/src/auto-reply/reply/reply-delivery.test.ts @@ -13,7 +13,7 @@ type BlockReplyPipelineLike = NonNullable< >; describe("createBlockReplyDeliveryHandler", () => { - it("keeps captioned media-bearing block replies buffered when block streaming is disabled", async () => { + it("sends captioned media-bearing block replies when block streaming is disabled", async () => { const onBlockReply = vi.fn(async () => {}); const normalizeStreamingText = vi.fn((payload: { text?: string }) => ({ text: payload.text, @@ -40,11 +40,57 @@ describe("createBlockReplyDeliveryHandler", () => { replyToCurrent: true, }); - expect(onBlockReply).not.toHaveBeenCalled(); - expect(directlySentBlockKeys).toEqual(new Set()); + const expectedPayload = { + text: "here's the vibe", + mediaUrl: "/tmp/generated.png", + mediaUrls: ["/tmp/generated.png"], + replyToCurrent: true, + replyToId: undefined, + replyToTag: undefined, + audioAsVoice: false, + }; + + expect(onBlockReply).toHaveBeenCalledWith(expectedPayload); + expect(directlySentBlockKeys).toEqual(new Set([createBlockReplyContentKey(expectedPayload)])); expect(typingSignals.signalTextDelta).toHaveBeenCalledWith("here's the vibe"); }); + it("sends captioned audio-as-voice block replies when block streaming is disabled", async () => { + const onBlockReply = vi.fn(async () => {}); + const directlySentBlockKeys = new Set(); + + const handler = createBlockReplyDeliveryHandler({ + onBlockReply, + normalizeStreamingText: (payload) => ({ text: payload.text, skip: false }), + applyReplyToMode: (payload) => payload, + typingSignals: { + signalTextDelta: vi.fn(async () => {}), + } as unknown as TypingSignaler, + blockStreamingEnabled: false, + blockReplyPipeline: null, + directlySentBlockKeys, + }); + + await handler({ + text: "spoken confirmation", + mediaUrls: ["/tmp/voice.opus"], + audioAsVoice: true, + }); + + const expectedPayload = { + text: "spoken confirmation", + mediaUrl: "/tmp/voice.opus", + 
mediaUrls: ["/tmp/voice.opus"], + replyToId: undefined, + replyToCurrent: undefined, + replyToTag: undefined, + audioAsVoice: true, + }; + + expect(onBlockReply).toHaveBeenCalledWith(expectedPayload); + expect(directlySentBlockKeys).toEqual(new Set([createBlockReplyContentKey(expectedPayload)])); + }); + it("sends media-only block replies when block streaming is disabled", async () => { const onBlockReply = vi.fn(async () => {}); const directlySentBlockKeys = new Set(); diff --git a/src/auto-reply/reply/reply-delivery.ts b/src/auto-reply/reply/reply-delivery.ts index d3c92bc5cdf..1df5b8322fb 100644 --- a/src/auto-reply/reply/reply-delivery.ts +++ b/src/auto-reply/reply/reply-delivery.ts @@ -157,9 +157,7 @@ export function createBlockReplyDeliveryHandler(params: { trackingPayload: blockPayload, payload: blockPayload, }); - } else if (blockHasMedia && !blockPayload.text) { - // Media-only block replies (for example orphaned tool attachments) are not reconstructible - // from the assistant's final text, so they still need a direct fallback when streaming is off. + } else if (blockHasMedia) { await sendDirectBlockReply({ onBlockReply: params.onBlockReply, directlySentBlockKeys: params.directlySentBlockKeys,