From f83dbbc926f2eaa6e59dbae9051ca8f627d3d517 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 10 May 2026 05:00:26 +0100 Subject: [PATCH] fix(discord): prevent realtime answer replacement --- CHANGELOG.md | 1 + docs/channels/discord.md | 8 +- .../discord/src/voice/manager.e2e.test.ts | 99 +++++++++++++++- extensions/discord/src/voice/realtime.ts | 112 +++++++++++++++--- 4 files changed, 198 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50b8142cd1a..c60e2429241 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Discord/voice: keep default agent-proxy realtime sessions from auto-speaking filler before the forced OpenClaw consult answer, finish Discord playback on realtime response completion, and queue later exact-speech answers until playback idles to avoid mid-sentence replacement. - OpenAI/realtime voice: honor disabled input-audio interruption locally so server VAD speech-start events do not clear Discord playback after operators set `interruptResponseOnInputAudio: false`. - Telegram: handle managed select button callbacks before the raw callback fallback while preserving delimiter-containing option values such as `env|prod`. (#79816) Thanks @moeedahmed. - CLI/media: let explicit image description model refs use bundled static provider catalogs and generic model-backed image hooks, so `openclaw infer image describe --model zai/glm-4.6v` works like direct model runs and Anthropic auth probes avoid stale Claude 3 Haiku catalog entries. diff --git a/docs/channels/discord.md b/docs/channels/discord.md index 866551e7bc5..603e3b589f9 100644 --- a/docs/channels/discord.md +++ b/docs/channels/discord.md @@ -1200,7 +1200,7 @@ Notes: - `voice.mode` controls the conversation path. The default is `agent-proxy`: a realtime voice front end handles turn timing, interruption, and playback, delegates substantive work to the routed OpenClaw agent through `openclaw_agent_consult`, and treats the result like a typed Discord prompt from that speaker. `stt-tts` keeps the older batch STT plus TTS flow. `bidi` lets the realtime model converse directly while exposing `openclaw_agent_consult` for the OpenClaw brain. - `voice.agentSession` controls which OpenClaw conversation receives voice turns. Leave it unset for the voice channel's own session, or set `{ mode: "target", target: "channel:" }` to make the voice channel act as the microphone/speaker extension of an existing Discord text channel session such as `#maintainers`. - `voice.model` overrides the OpenClaw agent brain for Discord voice responses and realtime consults. Leave it unset to inherit the routed agent model. It is separate from `voice.realtime.model`. -- `agent-proxy` routes speech through `discord-voice`, which preserves normal owner/tool authorization for the speaker and target session but hides the agent `tts` tool because Discord voice owns playback. By default, `agent-proxy` gives the consult full owner-equivalent tool access for owner speakers (`voice.realtime.toolPolicy: "owner"`) and strongly prefers consulting the OpenClaw agent before substantive answers (`voice.realtime.consultPolicy: "always"`). +- `agent-proxy` routes speech through `discord-voice`, which preserves normal owner/tool authorization for the speaker and target session but hides the agent `tts` tool because Discord voice owns playback. By default, `agent-proxy` gives the consult full owner-equivalent tool access for owner speakers (`voice.realtime.toolPolicy: "owner"`) and strongly prefers consulting the OpenClaw agent before substantive answers (`voice.realtime.consultPolicy: "always"`). In that default `always` mode, the realtime layer does not auto-speak filler before the consult answer; it captures and transcribes speech, then speaks the routed OpenClaw answer. If multiple forced consult answers finish while Discord is still playing the first answer, later exact-speech answers are queued until playback idles instead of replacing speech mid-sentence. - In `stt-tts` mode, STT uses `tools.media.audio`; `voice.model` does not affect transcription. - In realtime modes, `voice.realtime.provider`, `voice.realtime.model`, and `voice.realtime.voice` configure the realtime audio session. For OpenAI Realtime 2 plus the Codex brain, use `voice.realtime.model: "gpt-realtime-2"` and `voice.model: "openai-codex/gpt-5.5"`. - `voice.realtime.bargeIn` controls whether Discord speaker-start events interrupt active realtime playback. If unset, it follows the realtime provider's input-audio interruption setting. @@ -1368,11 +1368,13 @@ Use this when the model hears its own Discord playback through an open mic, but Expected voice logs: - On join: `discord voice: joining ... voiceSession=... supervisorSession=... agentSessionMode=... voiceModel=... realtimeModel=...` -- On realtime start: `discord voice: realtime bridge starting ... interruptResponse=false bargeIn=true minBargeInAudioEndMs=...` +- On realtime start: `discord voice: realtime bridge starting ... autoRespond=false interruptResponse=false bargeIn=false minBargeInAudioEndMs=...` - On speaker audio: `discord voice: realtime speaker turn opened ...`, `discord voice: realtime input audio started ... outputAudioMs=... outputActive=...`, and `discord voice: realtime speaker turn closed ... chunks=... discordBytes=... realtimeBytes=... interruptedPlayback=...` +- On realtime response completion: `discord voice: realtime audio playback finishing reason=response.done ... audioMs=... chunks=...` - On playback stop/reset: `discord voice: realtime audio playback stopped reason=... audioMs=... elapsedMs=... chunks=...` - On realtime consult: `discord voice: realtime consult requested ... voiceSession=... supervisorSession=... question=...` - On agent answer: `discord voice: agent turn answer ...` +- On queued exact speech: `discord voice: realtime exact speech queued ... queued=... outputAudioMs=... outputActive=...`, followed by `discord voice: realtime exact speech dequeued reason=player-idle ...` - On barge-in detection: `discord voice: realtime barge-in detected source=speaker-start ...` or `discord voice: realtime barge-in detected source=active-speaker-audio ...`, followed by `discord voice: realtime barge-in requested reason=... outputAudioMs=... outputActive=...` - On realtime interruption: `discord voice: realtime model interrupt requested client:response.cancel reason=barge-in`, followed by either `discord voice: realtime model audio truncated client:conversation.item.truncate reason=barge-in audioEndMs=...` or `discord voice: realtime model interrupt confirmed server:response.done status=cancelled ...` - On ignored echo/noise: `discord voice: realtime model interrupt ignored client:conversation.item.truncate.skipped reason=barge-in audioEndMs=0 minAudioEndMs=250` @@ -1386,7 +1388,7 @@ To debug cut-off audio, read the realtime voice logs as a timeline: 3. `realtime input audio started` marks the first actual audio frame received for that speaker turn. `outputActive=true` or a nonzero `outputAudioMs` here means the mic is sending input while assistant playback is still active. 4. `barge-in detected source=active-speaker-audio` means OpenClaw saw live speaker audio while assistant playback was active. This is useful for distinguishing a real interruption from a Discord speaker-start event with no useful audio. 5. `barge-in requested reason=...` means OpenClaw asked the realtime provider to cancel or truncate the active response. It includes `outputAudioMs`, `outputActive`, and `playbackChunks` so you can see how much assistant audio had actually played before the interruption. -6. `realtime audio playback stopped reason=...` is the local Discord playback reset point. The reason says who stopped playback: `barge-in`, `player-idle`, `provider-clear-audio`, `forced-agent-consult`, `forced-agent-consult-answer`, `stream-close`, or `session-close`. +6. `realtime audio playback stopped reason=...` is the local Discord playback reset point. The reason says who stopped playback: `barge-in`, `player-idle`, `provider-clear-audio`, `forced-agent-consult`, `stream-close`, or `session-close`. 7. `realtime speaker turn closed` summarizes the captured input turn. `chunks=0` or `hasAudio=false` means the speaker turn opened but no usable audio reached the realtime bridge. `interruptedPlayback=true` means that input turn overlapped assistant output and triggered barge-in logic. Useful fields: diff --git a/extensions/discord/src/voice/manager.e2e.test.ts b/extensions/discord/src/voice/manager.e2e.test.ts index 661f6cf8c85..dab1d7b2222 100644 --- a/extensions/discord/src/voice/manager.e2e.test.ts +++ b/extensions/discord/src/voice/manager.e2e.test.ts @@ -919,7 +919,7 @@ describe("DiscordVoiceManager", () => { ) => void; } | undefined; - expect(bridgeParams?.autoRespondToAudio).toBe(true); + expect(bridgeParams?.autoRespondToAudio).toBe(false); expect(bridgeParams?.instructions).toContain("same OpenClaw agent"); expect(bridgeParams?.tools?.map((tool) => tool.name)).toContain("openclaw_agent_consult"); @@ -1038,12 +1038,19 @@ describe("DiscordVoiceManager", () => { audioSink?: { sendAudio: (audio: Buffer) => void; }; + onEvent?: (event: { direction: "server"; type: string }) => void; } | undefined; bridgeParams?.audioSink?.sendAudio(Buffer.alloc(480)); expect(createAudioResourceMock).toHaveBeenCalledTimes(1); expect(player.play).toHaveBeenCalledTimes(1); + const firstStream = createAudioResourceMock.mock.calls.at(-1)?.[0] as + | { writableEnded?: boolean } + | undefined; + expect(firstStream?.writableEnded).toBe(false); + bridgeParams?.onEvent?.({ direction: "server", type: "response.done" }); + expect(firstStream?.writableEnded).toBe(true); const idleHandler = player.on.mock.calls.find(([event]) => event === "idle")?.[1] as | (() => void) @@ -1123,6 +1130,7 @@ describe("DiscordVoiceManager", () => { const bridgeParams = createRealtimeVoiceBridgeSessionMock.mock.calls.at(-1)?.[0] as | { onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void; + onEvent?: (event: { direction: "server"; type: string }) => void; } | undefined; bridgeParams?.onTranscript?.("user", "non-owner question", true); @@ -1139,10 +1147,7 @@ describe("DiscordVoiceManager", () => { }), expect.anything(), ); - expect(realtimeSessionMock.handleBargeIn).toHaveBeenCalledWith({ - audioPlaybackActive: true, - force: true, - }); + expect(realtimeSessionMock.handleBargeIn).not.toHaveBeenCalled(); expect(realtimeSessionMock.sendUserMessage).toHaveBeenCalledWith( expect.stringContaining("non-owner answer"), ); @@ -1191,6 +1196,7 @@ describe("DiscordVoiceManager", () => { bridgeParams?.onTranscript?.("user", "owner question", true); await new Promise((resolve) => setTimeout(resolve, 260)); + bridgeParams?.onEvent?.({ direction: "server", type: "response.done" }); expect(agentCommandMock).toHaveBeenNthCalledWith( 1, @@ -1216,6 +1222,87 @@ describe("DiscordVoiceManager", () => { ); }); + it("queues forced agent-proxy answers until current realtime playback idles", async () => { + let resolveFirst: ((value: { payloads: Array<{ text: string }> }) => void) | undefined; + let resolveSecond: ((value: { payloads: Array<{ text: string }> }) => void) | undefined; + agentCommandMock + .mockImplementationOnce( + () => + new Promise<{ payloads: Array<{ text: string }> }>((resolve) => { + resolveFirst = resolve; + }), + ) + .mockImplementationOnce( + () => + new Promise<{ payloads: Array<{ text: string }> }>((resolve) => { + resolveSecond = resolve; + }), + ); + const manager = createManager({ + groupPolicy: "open", + voice: { + enabled: true, + mode: "agent-proxy", + realtime: { provider: "openai" }, + }, + }); + + await manager.join({ guildId: "g1", channelId: "1001" }); + const entry = getSessionEntry(manager) as { + realtime?: { + beginSpeakerTurn: ( + context: { extraSystemPrompt?: string; senderIsOwner: boolean; speakerLabel: string }, + userId: string, + ) => { close: () => void; sendInputAudio: (audio: Buffer) => void }; + }; + }; + const player = getLastAudioPlayer() as { + on: ReturnType; + }; + const bridgeParams = createRealtimeVoiceBridgeSessionMock.mock.calls.at(-1)?.[0] as + | { + audioSink?: { sendAudio: (audio: Buffer) => void }; + onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void; + } + | undefined; + + const firstTurn = entry.realtime?.beginSpeakerTurn( + { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" }, + "u-owner", + ); + firstTurn?.sendInputAudio(Buffer.alloc(8)); + bridgeParams?.onTranscript?.("user", "first question", true); + const secondTurn = entry.realtime?.beginSpeakerTurn( + { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" }, + "u-owner", + ); + secondTurn?.sendInputAudio(Buffer.alloc(8)); + bridgeParams?.onTranscript?.("user", "second question", true); + await new Promise((resolve) => setTimeout(resolve, 260)); + + resolveFirst?.({ payloads: [{ text: "first answer" }] }); + await vi.waitFor(() => + expect(realtimeSessionMock.sendUserMessage).toHaveBeenCalledWith( + expect.stringContaining("first answer"), + ), + ); + bridgeParams?.audioSink?.sendAudio(Buffer.alloc(480)); + + resolveSecond?.({ payloads: [{ text: "second answer" }] }); + await new Promise((resolve) => setTimeout(resolve, 0)); + expect(realtimeSessionMock.sendUserMessage).not.toHaveBeenCalledWith( + expect.stringContaining("second answer"), + ); + + const idleHandler = player.on.mock.calls.find(([event]) => event === "idle")?.[1] as + | (() => void) + | undefined; + idleHandler?.(); + expect(realtimeSessionMock.sendUserMessage).toHaveBeenCalledWith( + expect.stringContaining("second answer"), + ); + }); + it("matches agent-proxy consult tool calls to the pending transcript", async () => { agentCommandMock .mockResolvedValueOnce({ payloads: [{ text: "owner answer" }] }) @@ -1250,6 +1337,7 @@ describe("DiscordVoiceManager", () => { session: typeof realtimeSessionMock, ) => void; onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void; + onEvent?: (event: { direction: "server"; type: string }) => void; } | undefined; @@ -1522,6 +1610,7 @@ describe("DiscordVoiceManager", () => { expect(realtimeSessionMock.submitToolResult).toHaveBeenCalledWith("call-late", { error: "Discord speaker context changed before this realtime consult completed", }); + bridgeParams?.onEvent?.({ direction: "server", type: "response.done" }); bridgeParams?.onTranscript?.("user", "guest followup", true); await new Promise((resolve) => setTimeout(resolve, 260)); diff --git a/extensions/discord/src/voice/realtime.ts b/extensions/discord/src/voice/realtime.ts index 6f93a5e6faa..1b56664649d 100644 --- a/extensions/discord/src/voice/realtime.ts +++ b/extensions/discord/src/voice/realtime.ts @@ -273,8 +273,13 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { private outputAudioRealtimeBytes = 0; private outputAudioChunks = 0; private outputAudioStartedAt: number | undefined; + private outputStreamEnding = false; + private queuedExactSpeechMessages: string[] = []; + private exactSpeechResponseActive = false; + private exactSpeechAudioStarted = false; private readonly playerIdleHandler = () => { this.resetOutputStream("player-idle"); + this.completeExactSpeechResponse("player-idle"); }; constructor( @@ -307,7 +312,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { }), }; }, - deliver: (text) => this.bridge?.sendUserMessage(buildDiscordSpeakExactUserMessage(text)), + deliver: (text) => this.enqueueExactSpeechMessage(text), }); } @@ -333,6 +338,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { const consultPolicy = this.realtimeConfig?.consultPolicy ?? (isAgentProxy ? "always" : "auto"); this.consultPolicy = consultPolicy; const usesRealtimeAgentHandoff = this.params.mode === "bidi" || toolPolicy !== "none"; + const autoRespondToAudio = !isAgentProxy || consultPolicy !== "always"; const interruptResponseOnInputAudio = resolveDiscordRealtimeInterruptResponseOnInputAudio({ realtimeConfig: this.realtimeConfig, providerId: resolved.provider.id, @@ -348,7 +354,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { providerConfig: resolved.providerConfig, audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, instructions, - autoRespondToAudio: usesRealtimeAgentHandoff, + autoRespondToAudio, interruptResponseOnInputAudio, markStrategy: "ack-immediately", tools: usesRealtimeAgentHandoff ? resolveRealtimeVoiceAgentConsultTools(toolPolicy) : [], @@ -376,6 +382,15 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { onEvent: (event) => { const detail = event.detail ? ` ${event.detail}` : ""; logVoiceVerbose(`realtime ${event.direction}:${event.type}${detail}`); + const responseEnded = + event.direction === "server" && + (event.type === "response.done" || event.type === "response.cancelled"); + if (responseEnded) { + if (this.exactSpeechResponseActive && !this.exactSpeechAudioStarted) { + this.completeExactSpeechResponse(event.type); + } + this.finishOutputAudioStream(event.type); + } const interruptionLog = formatRealtimeInterruptionLog(event); if (interruptionLog) { logger.info(interruptionLog); @@ -389,7 +404,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { readProviderConfigString(resolved.providerConfig, "model") ?? resolved.provider.defaultModel; const resolvedVoice = readProviderConfigString(resolved.providerConfig, "voice"); logger.info( - `discord voice: realtime bridge starting mode=${this.params.mode} provider=${resolved.provider.id} model=${resolvedModel ?? "default"} voice=${resolvedVoice ?? "default"} consultPolicy=${consultPolicy} toolPolicy=${toolPolicy} autoRespond=${usesRealtimeAgentHandoff} interruptResponse=${interruptResponseOnInputAudio} bargeIn=${resolveDiscordRealtimeBargeIn( + `discord voice: realtime bridge starting mode=${this.params.mode} provider=${resolved.provider.id} model=${resolvedModel ?? "default"} voice=${resolvedVoice ?? "default"} consultPolicy=${consultPolicy} toolPolicy=${toolPolicy} autoRespond=${autoRespondToAudio} interruptResponse=${interruptResponseOnInputAudio} bargeIn=${resolveDiscordRealtimeBargeIn( { realtimeConfig: this.realtimeConfig, providerId: resolved.provider.id, @@ -411,6 +426,9 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { this.pendingAgentProxyConsultContexts = []; this.recentAgentProxyConsultContexts = []; this.pendingSpeakerTurns.length = 0; + this.queuedExactSpeechMessages = []; + this.exactSpeechResponseActive = false; + this.exactSpeechAudioStarted = false; this.clearOutputAudio("session-close"); this.bridge?.close(); this.bridge = null; @@ -522,6 +540,9 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { } this.syncOutputAudioTimestamp(); const stream = this.ensureOutputStream(); + if (this.exactSpeechResponseActive) { + this.exactSpeechAudioStarted = true; + } stream.write(discordPcm); this.outputAudioDiscordBytes += discordPcm.length; this.outputAudioRealtimeBytes += realtimePcm24kMono.length; @@ -545,6 +566,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { this.logOutputAudioStopped("stream-close"); this.outputStream = null; this.resetOutputAudioStats(); + this.completeExactSpeechResponse("stream-close"); } }); const resource = voiceSdk.createAudioResource(stream, { @@ -572,6 +594,69 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { stream?.destroy(); } + private finishOutputAudioStream(reason: string): void { + const stream = this.outputStream; + if (!stream || stream.destroyed || this.outputStreamEnding) { + return; + } + this.outputStreamEnding = true; + logger.info( + `discord voice: realtime audio playback finishing reason=${reason} guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} audioMs=${Math.floor(this.outputAudioTimestampMs)} chunks=${this.outputAudioChunks}`, + ); + stream.end(); + } + + private enqueueExactSpeechMessage(text: string): void { + if (this.stopped || !text.trim()) { + return; + } + if (this.exactSpeechResponseActive || this.hasInterruptibleOutputAudio()) { + this.queuedExactSpeechMessages.push(text); + logger.info( + `discord voice: realtime exact speech queued guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} queued=${this.queuedExactSpeechMessages.length} outputAudioMs=${Math.floor(this.outputAudioTimestampMs)} outputActive=${this.isOutputAudioActive()}`, + ); + return; + } + this.sendExactSpeechMessage(text); + } + + private sendExactSpeechMessage(text: string): void { + if (this.stopped || !text.trim()) { + return; + } + this.exactSpeechResponseActive = true; + this.exactSpeechAudioStarted = false; + this.bridge?.sendUserMessage(buildDiscordSpeakExactUserMessage(text)); + } + + private completeExactSpeechResponse(reason: string): void { + if (!this.exactSpeechResponseActive && this.queuedExactSpeechMessages.length === 0) { + return; + } + this.exactSpeechResponseActive = false; + this.exactSpeechAudioStarted = false; + this.drainQueuedExactSpeechMessages(reason); + } + + private drainQueuedExactSpeechMessages(reason: string): void { + if ( + this.stopped || + this.exactSpeechResponseActive || + this.queuedExactSpeechMessages.length === 0 || + this.hasInterruptibleOutputAudio() + ) { + return; + } + const next = this.queuedExactSpeechMessages.shift(); + if (!next) { + return; + } + logger.info( + `discord voice: realtime exact speech dequeued reason=${reason} guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} queued=${this.queuedExactSpeechMessages.length}`, + ); + this.sendExactSpeechMessage(next); + } + private logOutputAudioStopped(reason: string): void { const audioMs = Math.floor(this.outputAudioTimestampMs); const chunks = this.outputAudioChunks; @@ -591,6 +676,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { this.outputAudioRealtimeBytes = 0; this.outputAudioChunks = 0; this.outputAudioStartedAt = undefined; + this.outputStreamEnding = false; } private syncOutputAudioTimestamp(): void { @@ -802,12 +888,13 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { logger.info( `discord voice: realtime forced agent consult starting chars=${question.length} voiceSession=${this.params.entry.voiceSessionKey} supervisorSession=${this.params.entry.route.sessionKey} agent=${this.params.entry.route.agentId} speaker=${context.speakerLabel} owner=${context.senderIsOwner}`, ); - this.syncOutputAudioTimestamp(); - logger.info( - `discord voice: realtime barge-in requested reason=forced-agent-consult guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} outputAudioMs=${Math.floor(this.outputAudioTimestampMs)} outputActive=${this.isOutputAudioActive()} playbackChunks=${this.outputAudioChunks} force=true`, - ); - this.bridge?.handleBargeIn({ audioPlaybackActive: true, force: true }); - this.clearOutputAudio("forced-agent-consult"); + if (this.hasInterruptibleOutputAudio()) { + logger.info( + `discord voice: realtime barge-in requested reason=forced-agent-consult guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} outputAudioMs=${Math.floor(this.outputAudioTimestampMs)} outputActive=${this.isOutputAudioActive()} playbackChunks=${this.outputAudioChunks} force=true`, + ); + this.bridge?.handleBargeIn({ audioPlaybackActive: true, force: true }); + this.clearOutputAudio("forced-agent-consult"); + } pending.recent.handledByForcedPlayback = true; try { const promise = this.runAgentTurn({ @@ -823,16 +910,13 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession { `discord voice: realtime forced agent consult answer (${text.length} chars) elapsedMs=${Date.now() - startedAt} voiceSession=${this.params.entry.voiceSessionKey} supervisorSession=${this.params.entry.route.sessionKey} agent=${this.params.entry.route.agentId}: ${formatRealtimeLogPreview(text)}`, ); if (text.trim()) { - this.clearOutputAudio("forced-agent-consult-answer"); - this.bridge?.sendUserMessage(buildDiscordSpeakExactUserMessage(text)); + this.enqueueExactSpeechMessage(text); } } catch (error) { logger.warn( `discord voice: realtime forced agent consult failed elapsedMs=${Date.now() - startedAt}: ${formatErrorMessage(error)}`, ); - this.bridge?.sendUserMessage( - buildDiscordSpeakExactUserMessage(DISCORD_REALTIME_FALLBACK_TEXT), - ); + this.enqueueExactSpeechMessage(DISCORD_REALTIME_FALLBACK_TEXT); } }