fix(agents): deliver agent TTS audio when block streaming is off (#78355)

Summary:
- The branch changes non-streaming block reply delivery to direct-send all media-bearing block replies, updates reply-delivery/media-path regression tests, and adds a changelog entry.
- Reproducibility: yes. Current main's predicate and unit test show captioned media-bearing block replies are not sent directly when block streaming is disabled, and the PR body adds real Telegram after-fix proof for the TTS path.

Automerge notes:
- PR branch already contained follow-up commit before automerge: test(agents): align direct media block delivery coverage

Validation:
- ClawSweeper review passed for head e9bb1314fe.
- Required merge gates passed before the squash merge.

Prepared head SHA: e9bb1314fe
Review: https://github.com/openclaw/openclaw/pull/78355#issuecomment-4386200162

Co-authored-by: Clawdbot <clawdbot@apilab.us>
Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
Conan-Scott
2026-05-06 19:37:22 +10:00
committed by GitHub
parent ffafa9008d
commit e437763246
4 changed files with 53 additions and 8 deletions

View File

@@ -142,6 +142,7 @@ Docs: https://docs.openclaw.ai
- Agents/context engines: keep hidden OpenClaw runtime-context custom messages out of context-engine assemble, afterTurn, and ingest hooks so transcript reconstruction plugins only see conversation messages. Thanks @vincentkoc.
- Network/runtime: avoid importing Undici's package dispatcher during no-proxy timeout bootstrap so external channel plugin fetch requests with explicit Content-Length keep working. Fixes #78007. Thanks @shakkernerd.
- Gateway/shutdown: cancel delayed post-ready maintenance during close and suppress maintenance/cron startup after quick restarts, preventing orphaned background timers. Thanks @vincentkoc.
- Agents/TTS: send media-bearing block replies directly when block streaming is off, so agent `tts` tool audio attached to a final text reply is delivered instead of being consumed before final Telegram/media delivery. Thanks @Conan-Scott.
- Agents/generated media: treat attachment-style message tool actions as completed chat sends, preventing duplicate fallback media posts when generated files were already uploaded.
- Control UI/sessions: show each session's agent runtime in the Sessions table and allow filtering by runtime labels, matching the Agents panel runtime wording. Thanks @vincentkoc.
- Discord/streaming: show live reasoning text in progress drafts instead of a bare `Reasoning` status line.

View File

@@ -268,7 +268,8 @@ describe("runReplyAgent media path normalization", () => {
}),
);
expect(result).toMatchObject({
expect(result).toBeUndefined();
expect(onBlockReply).toHaveBeenCalledWith({
text: "here is the chart",
mediaUrl: "/tmp/outbound-media/1-chart.png",
mediaUrls: ["/tmp/outbound-media/1-chart.png"],
@@ -277,7 +278,6 @@ describe("runReplyAgent media path normalization", () => {
audioAsVoice: false,
});
expect(resolveOutboundAttachmentFromUrlMock).toHaveBeenCalledTimes(1);
expect(onBlockReply).not.toHaveBeenCalled();
});
it("does not create a second media context inside runAgentTurnWithFallback when onBlockReply is provided", async () => {

View File

@@ -13,7 +13,7 @@ type BlockReplyPipelineLike = NonNullable<
>;
describe("createBlockReplyDeliveryHandler", () => {
it("keeps captioned media-bearing block replies buffered when block streaming is disabled", async () => {
it("sends captioned media-bearing block replies when block streaming is disabled", async () => {
const onBlockReply = vi.fn(async () => {});
const normalizeStreamingText = vi.fn((payload: { text?: string }) => ({
text: payload.text,
@@ -40,11 +40,57 @@ describe("createBlockReplyDeliveryHandler", () => {
replyToCurrent: true,
});
expect(onBlockReply).not.toHaveBeenCalled();
expect(directlySentBlockKeys).toEqual(new Set());
const expectedPayload = {
text: "here's the vibe",
mediaUrl: "/tmp/generated.png",
mediaUrls: ["/tmp/generated.png"],
replyToCurrent: true,
replyToId: undefined,
replyToTag: undefined,
audioAsVoice: false,
};
expect(onBlockReply).toHaveBeenCalledWith(expectedPayload);
expect(directlySentBlockKeys).toEqual(new Set([createBlockReplyContentKey(expectedPayload)]));
expect(typingSignals.signalTextDelta).toHaveBeenCalledWith("here's the vibe");
});
// Regression test: a block reply carrying both caption text and voice audio must be
// handed to onBlockReply directly when block streaming is disabled.
it("sends captioned audio-as-voice block replies when block streaming is disabled", async () => {
const onBlockReply = vi.fn(async () => {});
const directlySentBlockKeys = new Set<string>();
// Build the handler with streaming off and no pipeline; text normalization and
// reply-to handling are pass-through stubs so the payload reaches delivery unchanged.
const handler = createBlockReplyDeliveryHandler({
onBlockReply,
normalizeStreamingText: (payload) => ({ text: payload.text, skip: false }),
applyReplyToMode: (payload) => payload,
typingSignals: {
signalTextDelta: vi.fn(async () => {}),
} as unknown as TypingSignaler,
blockStreamingEnabled: false,
blockReplyPipeline: null,
directlySentBlockKeys,
});
// Input has text AND media, with audioAsVoice set.
await handler({
text: "spoken confirmation",
mediaUrls: ["/tmp/voice.opus"],
audioAsVoice: true,
});
// Expected delivered shape: mediaUrl mirrors the single mediaUrls entry, the reply-to
// fields are present but undefined, and audioAsVoice stays true.
const expectedPayload = {
text: "spoken confirmation",
mediaUrl: "/tmp/voice.opus",
mediaUrls: ["/tmp/voice.opus"],
replyToId: undefined,
replyToCurrent: undefined,
replyToTag: undefined,
audioAsVoice: true,
};
expect(onBlockReply).toHaveBeenCalledWith(expectedPayload);
// The content key of the delivered payload must be the only entry recorded in
// directlySentBlockKeys.
expect(directlySentBlockKeys).toEqual(new Set([createBlockReplyContentKey(expectedPayload)]));
});
it("sends media-only block replies when block streaming is disabled", async () => {
const onBlockReply = vi.fn(async () => {});
const directlySentBlockKeys = new Set<string>();

View File

@@ -157,9 +157,7 @@ export function createBlockReplyDeliveryHandler(params: {
trackingPayload: blockPayload,
payload: blockPayload,
});
} else if (blockHasMedia && !blockPayload.text) {
// Media-only block replies (for example orphaned tool attachments) are not reconstructible
// from the assistant's final text, so they still need a direct fallback when streaming is off.
} else if (blockHasMedia) {
await sendDirectBlockReply({
onBlockReply: params.onBlockReply,
directlySentBlockKeys: params.directlySentBlockKeys,