From 39343088edb8fe4e5d6e354539e12ff7968cb8ce Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 18:04:44 +0100 Subject: [PATCH] fix(tts): keep media-only no-reply payloads --- CHANGELOG.md | 3 ++ docs/concepts/messages.md | 2 ++ .../attempt.spawn-workspace.test-support.ts | 1 + src/agents/pi-embedded-runner/run/attempt.ts | 4 +++ ...edded-subscribe.handlers.lifecycle.test.ts | 21 ++++++++++++-- ...i-embedded-subscribe.handlers.lifecycle.ts | 8 ++++-- ...bedded-subscribe.handlers.messages.test.ts | 16 +++++++++++ ...pi-embedded-subscribe.handlers.messages.ts | 18 ++++++++++-- ...session.subscribeembeddedpisession.test.ts | 28 +++++++++++++++++++ src/agents/pi-embedded-subscribe.ts | 2 ++ 10 files changed, 94 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a6c822f798..48f3a1066d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,9 @@ Docs: https://docs.openclaw.ai - Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535) Thanks @azade-c. +- Agents/TTS: keep queued tool media when an assistant ends with `NO_REPLY` on + non-block delivery paths, so media-only generated audio replies still send. + (#60025) Thanks @bradlind1. - Telegram/STT: frame inbound voice-note transcripts as machine-generated, untrusted text in agent context while preserving raw transcript mention detection. Closes #33360. Thanks @smartchainark. diff --git a/docs/concepts/messages.md b/docs/concepts/messages.md index 98279ae79a7..1b38872bc17 100644 --- a/docs/concepts/messages.md +++ b/docs/concepts/messages.md @@ -154,6 +154,8 @@ Details: [Configuration](/gateway/config-agents#messages) and channel docs. ## Silent replies The exact silent token `NO_REPLY` / `no_reply` means “do not deliver a user-visible reply”. 
+When a turn also has pending tool media, such as generated TTS audio, OpenClaw +strips the silent text but still delivers the media attachment. OpenClaw resolves that behavior by conversation type: - Direct conversations disallow silence by default and rewrite a bare silent diff --git a/src/agents/pi-embedded-runner/run/attempt.spawn-workspace.test-support.ts b/src/agents/pi-embedded-runner/run/attempt.spawn-workspace.test-support.ts index 62f03bfc539..a6fccb82aed 100644 --- a/src/agents/pi-embedded-runner/run/attempt.spawn-workspace.test-support.ts +++ b/src/agents/pi-embedded-runner/run/attempt.spawn-workspace.test-support.ts @@ -90,6 +90,7 @@ export function createSubscriptionMock(): SubscriptionMock { getMessagingToolSentTexts: () => [] as string[], getMessagingToolSentMediaUrls: () => [] as string[], getMessagingToolSentTargets: () => [] as MessagingToolSend[], + getPendingToolMediaReply: () => null, getSuccessfulCronAdds: () => 0, getReplayState: () => ({ replayInvalid: false, diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index 76191d3f705..51e863cff22 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -2044,6 +2044,7 @@ export async function runEmbeddedAttempt( getMessagingToolSentTexts, getMessagingToolSentMediaUrls, getMessagingToolSentTargets, + getPendingToolMediaReply, getSuccessfulCronAdds, getReplayState, didSendViaMessagingTool, @@ -2994,6 +2995,7 @@ export async function runEmbeddedAttempt( messagingToolSentMediaUrls: getMessagingToolSentMediaUrls(), successfulCronAdds: getSuccessfulCronAdds(), }); + const pendingToolMediaReply = getPendingToolMediaReply(); const replayMetadata = replayMetadataFromState( observeReplayMetadata(getReplayState(), observedReplayMetadata), ); @@ -3077,6 +3079,8 @@ export async function runEmbeddedAttempt( messagingToolSentTexts: getMessagingToolSentTexts(), messagingToolSentMediaUrls: 
getMessagingToolSentMediaUrls(), messagingToolSentTargets: getMessagingToolSentTargets(), + toolMediaUrls: pendingToolMediaReply?.mediaUrls, + toolAudioAsVoice: pendingToolMediaReply?.audioAsVoice, successfulCronAdds: getSuccessfulCronAdds(), cloudCodeAssistFormatError: Boolean( lastAssistant?.errorMessage && isCloudCodeAssistFormatError(lastAssistant.errorMessage), diff --git a/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts b/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts index 75a2ee1e06d..a27fde95bfd 100644 --- a/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts @@ -12,10 +12,13 @@ function createContext( overrides?: { onAgentEvent?: (event: unknown) => void; onBeforeLifecycleTerminal?: () => void | Promise<void>; + onBlockReply?: ((payload: unknown) => void) | undefined; onBlockReplyFlush?: () => void | Promise<void>; }, ): EmbeddedPiSubscribeContext { - const onBlockReply = vi.fn(); + const hasOnBlockReplyOverride = Boolean(overrides && "onBlockReply" in overrides); + const onBlockReply = hasOnBlockReplyOverride ? overrides?.onBlockReply : vi.fn(); + const emitBlockReply = vi.fn(); return { params: { runId: "run-1", @@ -23,7 +26,7 @@ function createContext( sessionKey: "agent:main:main", onAgentEvent: overrides?.onAgentEvent, onBeforeLifecycleTerminal: overrides?.onBeforeLifecycleTerminal, - onBlockReply, + ...(onBlockReply ? 
{ onBlockReply } : {}), onBlockReplyFlush: overrides?.onBlockReplyFlush, }, state: { @@ -43,7 +46,7 @@ function createContext( warn: vi.fn(), }, flushBlockReplyBuffer: vi.fn(), - emitBlockReply: onBlockReply, + emitBlockReply, resolveCompactionRetry: vi.fn(), maybeResolveCompactionWait: vi.fn(), } as unknown as EmbeddedPiSubscribeContext; @@ -321,6 +324,18 @@ describe("handleAgentEnd", () => { expect(ctx.state.pendingToolAudioAsVoice).toBe(false); }); + it("preserves orphaned tool media when no block reply callback is configured", async () => { + const ctx = createContext(undefined, { onBlockReply: undefined }); + ctx.state.pendingToolMediaUrls = ["/tmp/reply.opus"]; + ctx.state.pendingToolAudioAsVoice = true; + + await handleAgentEnd(ctx); + + expect(ctx.emitBlockReply).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(true); + }); + it("emits orphaned tool media before the lifecycle end event", async () => { const onAgentEvent = vi.fn(); const ctx = createContext(undefined, { onAgentEvent }); diff --git a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts index b81e42ea8d1..d497526cc86 100644 --- a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts +++ b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts @@ -167,9 +167,11 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext): void | Promise< }; const flushPendingMediaAndChannel = () => { - const pendingToolMediaReply = consumePendingToolMediaReply(ctx.state); - if (pendingToolMediaReply && hasAssistantVisibleReply(pendingToolMediaReply)) { - ctx.emitBlockReply(pendingToolMediaReply); + if (ctx.params.onBlockReply) { + const pendingToolMediaReply = consumePendingToolMediaReply(ctx.state); + if (pendingToolMediaReply && hasAssistantVisibleReply(pendingToolMediaReply)) { + ctx.emitBlockReply(pendingToolMediaReply); + } } const 
postMediaFlushResult = ctx.flushBlockReplyBuffer(); diff --git a/src/agents/pi-embedded-subscribe.handlers.messages.test.ts b/src/agents/pi-embedded-subscribe.handlers.messages.test.ts index a79a2bff95c..54a452fedfd 100644 --- a/src/agents/pi-embedded-subscribe.handlers.messages.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.messages.test.ts @@ -9,6 +9,7 @@ import { handleMessageEnd, handleMessageUpdate, hasAssistantVisibleReply, + readPendingToolMediaReply, recordPendingAssistantReplyDirectives, resolveSilentReplyFallbackText, } from "./pi-embedded-subscribe.handlers.messages.js"; @@ -394,6 +395,21 @@ describe("consumePendingToolMediaIntoReply", () => { }); describe("consumePendingToolMediaReply", () => { + it("reads a media-only reply without consuming queued tool media", () => { + const state = { + pendingToolMediaUrls: ["/tmp/reply.opus"], + pendingToolAudioAsVoice: true, + pendingToolTrustedLocalMedia: false, + }; + + expect(readPendingToolMediaReply(state)).toEqual({ + mediaUrls: ["/tmp/reply.opus"], + audioAsVoice: true, + }); + expect(state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]); + expect(state.pendingToolAudioAsVoice).toBe(true); + }); + it("builds a media-only reply for orphaned tool media", () => { const state = { pendingToolMediaUrls: ["/tmp/reply.opus"], diff --git a/src/agents/pi-embedded-subscribe.handlers.messages.ts b/src/agents/pi-embedded-subscribe.handlers.messages.ts index a536a6b1fc5..96261305c74 100644 --- a/src/agents/pi-embedded-subscribe.handlers.messages.ts +++ b/src/agents/pi-embedded-subscribe.handlers.messages.ts @@ -212,6 +212,20 @@ export function consumePendingToolMediaReply( EmbeddedPiSubscribeState, "pendingToolMediaUrls" | "pendingToolAudioAsVoice" | "pendingToolTrustedLocalMedia" >, +): BlockReplyPayload | null { + const payload = readPendingToolMediaReply(state); + if (!payload) { + return null; + } + clearPendingToolMedia(state); + return payload; +} + +export function readPendingToolMediaReply( + state: 
Pick< + EmbeddedPiSubscribeState, + "pendingToolMediaUrls" | "pendingToolAudioAsVoice" | "pendingToolTrustedLocalMedia" + >, ): BlockReplyPayload | null { if ( state.pendingToolMediaUrls.length === 0 && @@ -220,15 +234,13 @@ export function consumePendingToolMediaReply( ) { return null; } - const payload: BlockReplyPayload = { + return { mediaUrls: state.pendingToolMediaUrls.length ? Array.from(new Set(state.pendingToolMediaUrls)) : undefined, audioAsVoice: state.pendingToolAudioAsVoice || undefined, trustedLocalMedia: state.pendingToolTrustedLocalMedia || undefined, }; - clearPendingToolMedia(state); - return payload; } function hasReplyDirectiveMetadata(parsed: ReplyDirectiveParseResult | null | undefined): boolean { diff --git a/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.subscribeembeddedpisession.test.ts b/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.subscribeembeddedpisession.test.ts index d89d8e0eec5..9bfdd229bb8 100644 --- a/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.subscribeembeddedpisession.test.ts +++ b/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.subscribeembeddedpisession.test.ts @@ -414,6 +414,34 @@ describe("subscribeEmbeddedPiSession", () => { ); }); + it("keeps orphaned tool media available for non-block final payload assembly", () => { + const { emit, subscription } = createSubscribedSessionHarness({ + runId: "run", + builtinToolNames: new Set(["tts"]), + }); + + emit({ + type: "tool_execution_end", + toolName: "tts", + toolCallId: "tc-1", + isError: false, + result: { + details: { + media: { + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + }, + }, + }, + }); + emit({ type: "agent_end" }); + + expect(subscription.getPendingToolMediaReply()).toEqual({ + mediaUrls: ["/tmp/reply.opus"], + audioAsVoice: true, + }); + }); + it.each(THINKING_TAG_CASES)( "suppresses <%s> blocks across chunk boundaries", async ({ open, close }) => { diff --git 
a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index e074dc15671..e257abf58b5 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -24,6 +24,7 @@ import { createEmbeddedPiSessionEventHandler } from "./pi-embedded-subscribe.han import { consumePendingAssistantReplyDirectivesIntoReply, consumePendingToolMediaIntoReply, + readPendingToolMediaReply, } from "./pi-embedded-subscribe.handlers.messages.js"; import type { EmbeddedPiSubscribeContext, @@ -866,6 +867,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar getMessagingToolSentTexts: () => messagingToolSentTexts.slice(), getMessagingToolSentMediaUrls: () => messagingToolSentMediaUrls.slice(), getMessagingToolSentTargets: () => messagingToolSentTargets.slice(), + getPendingToolMediaReply: () => readPendingToolMediaReply(state), getSuccessfulCronAdds: () => state.successfulCronAdds, getReplayState: () => ({ ...state.replayState }), // Returns true if any messaging tool successfully sent a message.