fix(agents): deliver agent TTS audio when block streaming is off (#78355)

Summary:
- The branch changes non-streaming block reply delivery to direct-send all media-bearing block replies, updates reply-delivery/media-path regression tests, and adds a changelog entry.
- Reproducibility: yes. Current main's predicate and unit test show captioned media-bearing block replies are buffered rather than sent directly when block streaming is disabled, and the PR body adds real Telegram after-fix proof for the TTS path.

Automerge notes:
- PR branch already contained follow-up commit before automerge: test(agents): align direct media block delivery coverage

Validation:
- ClawSweeper review passed for head e9bb1314fe.
- Required merge gates passed before the squash merge.

Prepared head SHA: e9bb1314fe
Review: https://github.com/openclaw/openclaw/pull/78355#issuecomment-4386200162

Co-authored-by: Clawdbot <clawdbot@apilab.us>
Co-authored-by: Ayaan Zaidi <hi@obviy.us>
2026-05-06 18:50:42 +00:00 · 2026-05-06 19:37:22 +10:00
parent ffafa9008d
commit e437763246
4 changed files with 53 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -142,6 +142,7 @@ Docs: https://docs.openclaw.ai
 - Agents/context engines: keep hidden OpenClaw runtime-context custom messages out of context-engine assemble, afterTurn, and ingest hooks so transcript reconstruction plugins only see conversation messages. Thanks @vincentkoc.
 - Network/runtime: avoid importing Undici's package dispatcher during no-proxy timeout bootstrap so external channel plugin fetch requests with explicit Content-Length keep working. Fixes #78007. Thanks @shakkernerd.
 - Gateway/shutdown: cancel delayed post-ready maintenance during close and suppress maintenance/cron startup after quick restarts, preventing orphaned background timers. Thanks @vincentkoc.
+- Agents/TTS: send media-bearing block replies directly when block streaming is off, so agent `tts` tool audio attached to a final text reply is delivered instead of being consumed before final Telegram/media delivery. Thanks @Conan-Scott.
 - Agents/generated media: treat attachment-style message tool actions as completed chat sends, preventing duplicate fallback media posts when generated files were already uploaded.
 - Control UI/sessions: show each session's agent runtime in the Sessions table and allow filtering by runtime labels, matching the Agents panel runtime wording. Thanks @vincentkoc.
 - Discord/streaming: show live reasoning text in progress drafts instead of a bare `Reasoning` status line.
--- a/src/auto-reply/reply/agent-runner.media-paths.test.ts
+++ b/src/auto-reply/reply/agent-runner.media-paths.test.ts
@@ -268,7 +268,8 @@ describe("runReplyAgent media path normalization", () => {
      }),
    );

-    expect(result).toMatchObject({
+    expect(result).toBeUndefined();
+    expect(onBlockReply).toHaveBeenCalledWith({
      text: "here is the chart",
      mediaUrl: "/tmp/outbound-media/1-chart.png",
      mediaUrls: ["/tmp/outbound-media/1-chart.png"],
@@ -277,7 +278,6 @@ describe("runReplyAgent media path normalization", () => {
      audioAsVoice: false,
    });
    expect(resolveOutboundAttachmentFromUrlMock).toHaveBeenCalledTimes(1);
-    expect(onBlockReply).not.toHaveBeenCalled();
  });

  it("does not create a second media context inside runAgentTurnWithFallback when onBlockReply is provided", async () => {
--- a/src/auto-reply/reply/reply-delivery.test.ts
+++ b/src/auto-reply/reply/reply-delivery.test.ts
@@ -13,7 +13,7 @@ type BlockReplyPipelineLike = NonNullable<
 >;

 describe("createBlockReplyDeliveryHandler", () => {
-  it("keeps captioned media-bearing block replies buffered when block streaming is disabled", async () => {
+  it("sends captioned media-bearing block replies when block streaming is disabled", async () => {
    const onBlockReply = vi.fn(async () => {});
    const normalizeStreamingText = vi.fn((payload: { text?: string }) => ({
      text: payload.text,
@@ -40,11 +40,57 @@ describe("createBlockReplyDeliveryHandler", () => {
      replyToCurrent: true,
    });

-    expect(onBlockReply).not.toHaveBeenCalled();
-    expect(directlySentBlockKeys).toEqual(new Set());
+    const expectedPayload = {
+      text: "here's the vibe",
+      mediaUrl: "/tmp/generated.png",
+      mediaUrls: ["/tmp/generated.png"],
+      replyToCurrent: true,
+      replyToId: undefined,
+      replyToTag: undefined,
+      audioAsVoice: false,
+    };
+
+    expect(onBlockReply).toHaveBeenCalledWith(expectedPayload);
+    expect(directlySentBlockKeys).toEqual(new Set([createBlockReplyContentKey(expectedPayload)]));
    expect(typingSignals.signalTextDelta).toHaveBeenCalledWith("here's the vibe");
  });

+  it("sends captioned audio-as-voice block replies when block streaming is disabled", async () => {
+    const onBlockReply = vi.fn(async () => {});
+    const directlySentBlockKeys = new Set<string>();
+
+    const handler = createBlockReplyDeliveryHandler({
+      onBlockReply,
+      normalizeStreamingText: (payload) => ({ text: payload.text, skip: false }),
+      applyReplyToMode: (payload) => payload,
+      typingSignals: {
+        signalTextDelta: vi.fn(async () => {}),
+      } as unknown as TypingSignaler,
+      blockStreamingEnabled: false,
+      blockReplyPipeline: null,
+      directlySentBlockKeys,
+    });
+
+    await handler({
+      text: "spoken confirmation",
+      mediaUrls: ["/tmp/voice.opus"],
+      audioAsVoice: true,
+    });
+
+    const expectedPayload = {
+      text: "spoken confirmation",
+      mediaUrl: "/tmp/voice.opus",
+      mediaUrls: ["/tmp/voice.opus"],
+      replyToId: undefined,
+      replyToCurrent: undefined,
+      replyToTag: undefined,
+      audioAsVoice: true,
+    };
+
+    expect(onBlockReply).toHaveBeenCalledWith(expectedPayload);
+    expect(directlySentBlockKeys).toEqual(new Set([createBlockReplyContentKey(expectedPayload)]));
+  });
+
  it("sends media-only block replies when block streaming is disabled", async () => {
    const onBlockReply = vi.fn(async () => {});
    const directlySentBlockKeys = new Set<string>();
--- a/src/auto-reply/reply/reply-delivery.ts
+++ b/src/auto-reply/reply/reply-delivery.ts
@@ -157,9 +157,7 @@ export function createBlockReplyDeliveryHandler(params: {
        trackingPayload: blockPayload,
        payload: blockPayload,
      });
-    } else if (blockHasMedia && !blockPayload.text) {
-      // Media-only block replies (for example orphaned tool attachments) are not reconstructible
-      // from the assistant's final text, so they still need a direct fallback when streaming is off.
+    } else if (blockHasMedia) {
      await sendDirectBlockReply({
        onBlockReply: params.onBlockReply,
        directlySentBlockKeys: params.directlySentBlockKeys,