From f83dbbc926f2eaa6e59dbae9051ca8f627d3d517 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sun, 10 May 2026 05:00:26 +0100
Subject: [PATCH] fix(discord): prevent realtime answer replacement

---
 CHANGELOG.md                                  |   1 +
 docs/channels/discord.md                      |   8 +-
 .../discord/src/voice/manager.e2e.test.ts     |  99 +++++++++++++++-
 extensions/discord/src/voice/realtime.ts      | 112 +++++++++++++++---
 4 files changed, 198 insertions(+), 22 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 50b8142cd1a..c60e2429241 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Discord/voice: keep default agent-proxy realtime sessions from auto-speaking filler before the forced OpenClaw consult answer, finish Discord playback on realtime response completion, and queue later exact-speech answers until playback idles to avoid mid-sentence replacement.
 - OpenAI/realtime voice: honor disabled input-audio interruption locally so server VAD speech-start events do not clear Discord playback after operators set `interruptResponseOnInputAudio: false`.
 - Telegram: handle managed select button callbacks before the raw callback fallback while preserving delimiter-containing option values such as `env|prod`. (#79816) Thanks @moeedahmed.
 - CLI/media: let explicit image description model refs use bundled static provider catalogs and generic model-backed image hooks, so `openclaw infer image describe --model zai/glm-4.6v` works like direct model runs and Anthropic auth probes avoid stale Claude 3 Haiku catalog entries.
diff --git a/docs/channels/discord.md b/docs/channels/discord.md
index 866551e7bc5..603e3b589f9 100644
--- a/docs/channels/discord.md
+++ b/docs/channels/discord.md
@@ -1200,7 +1200,7 @@ Notes:
 - `voice.mode` controls the conversation path. The default is `agent-proxy`: a realtime voice front end handles turn timing, interruption, and playback, delegates substantive work to the routed OpenClaw agent through `openclaw_agent_consult`, and treats the result like a typed Discord prompt from that speaker. `stt-tts` keeps the older batch STT plus TTS flow. `bidi` lets the realtime model converse directly while exposing `openclaw_agent_consult` for the OpenClaw brain.
 - `voice.agentSession` controls which OpenClaw conversation receives voice turns. Leave it unset for the voice channel's own session, or set `{ mode: "target", target: "channel:<text-channel-id>" }` to make the voice channel act as the microphone/speaker extension of an existing Discord text channel session such as `#maintainers`.
 - `voice.model` overrides the OpenClaw agent brain for Discord voice responses and realtime consults. Leave it unset to inherit the routed agent model. It is separate from `voice.realtime.model`.
-- `agent-proxy` routes speech through `discord-voice`, which preserves normal owner/tool authorization for the speaker and target session but hides the agent `tts` tool because Discord voice owns playback. By default, `agent-proxy` gives the consult full owner-equivalent tool access for owner speakers (`voice.realtime.toolPolicy: "owner"`) and strongly prefers consulting the OpenClaw agent before substantive answers (`voice.realtime.consultPolicy: "always"`).
+- `agent-proxy` routes speech through `discord-voice`, which preserves normal owner/tool authorization for the speaker and target session but hides the agent `tts` tool because Discord voice owns playback. By default, `agent-proxy` gives the consult full owner-equivalent tool access for owner speakers (`voice.realtime.toolPolicy: "owner"`) and strongly prefers consulting the OpenClaw agent before substantive answers (`voice.realtime.consultPolicy: "always"`). In that default `always` mode, the realtime layer does not auto-speak filler before the consult answer; it captures and transcribes speech, then speaks the routed OpenClaw answer. If multiple forced consult answers finish while Discord is still playing the first answer, later exact-speech answers are queued until playback idles instead of replacing speech mid-sentence.
 - In `stt-tts` mode, STT uses `tools.media.audio`; `voice.model` does not affect transcription.
 - In realtime modes, `voice.realtime.provider`, `voice.realtime.model`, and `voice.realtime.voice` configure the realtime audio session. For OpenAI Realtime 2 plus the Codex brain, use `voice.realtime.model: "gpt-realtime-2"` and `voice.model: "openai-codex/gpt-5.5"`.
 - `voice.realtime.bargeIn` controls whether Discord speaker-start events interrupt active realtime playback. If unset, it follows the realtime provider's input-audio interruption setting.
@@ -1368,11 +1368,13 @@ Use this when the model hears its own Discord playback through an open mic, but
 Expected voice logs:
 
 - On join: `discord voice: joining ... voiceSession=... supervisorSession=... agentSessionMode=... voiceModel=... realtimeModel=...`
-- On realtime start: `discord voice: realtime bridge starting ... interruptResponse=false bargeIn=true minBargeInAudioEndMs=...`
+- On realtime start: `discord voice: realtime bridge starting ... autoRespond=false interruptResponse=false bargeIn=false minBargeInAudioEndMs=...`
 - On speaker audio: `discord voice: realtime speaker turn opened ...`, `discord voice: realtime input audio started ... outputAudioMs=... outputActive=...`, and `discord voice: realtime speaker turn closed ... chunks=... discordBytes=... realtimeBytes=... interruptedPlayback=...`
+- On realtime response completion: `discord voice: realtime audio playback finishing reason=response.done ... audioMs=... chunks=...`
 - On playback stop/reset: `discord voice: realtime audio playback stopped reason=... audioMs=... elapsedMs=... chunks=...`
 - On realtime consult: `discord voice: realtime consult requested ... voiceSession=... supervisorSession=... question=...`
 - On agent answer: `discord voice: agent turn answer ...`
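+- On queued exact speech: `discord voice: realtime exact speech queued ... queued=... outputAudioMs=... outputActive=...`, followed by `discord voice: realtime exact speech dequeued reason=... ...` once the previous answer finishes. The dequeue reason is normally `player-idle`; it is `response.done` or `response.cancelled` when the previous exact-speech response ended without producing any audio, and `stream-close` when the output stream closed.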
 - On barge-in detection: `discord voice: realtime barge-in detected source=speaker-start ...` or `discord voice: realtime barge-in detected source=active-speaker-audio ...`, followed by `discord voice: realtime barge-in requested reason=... outputAudioMs=... outputActive=...`
 - On realtime interruption: `discord voice: realtime model interrupt requested client:response.cancel reason=barge-in`, followed by either `discord voice: realtime model audio truncated client:conversation.item.truncate reason=barge-in audioEndMs=...` or `discord voice: realtime model interrupt confirmed server:response.done status=cancelled ...`
 - On ignored echo/noise: `discord voice: realtime model interrupt ignored client:conversation.item.truncate.skipped reason=barge-in audioEndMs=0 minAudioEndMs=250`
@@ -1386,7 +1388,7 @@ To debug cut-off audio, read the realtime voice logs as a timeline:
 3. `realtime input audio started` marks the first actual audio frame received for that speaker turn. `outputActive=true` or a nonzero `outputAudioMs` here means the mic is sending input while assistant playback is still active.
 4. `barge-in detected source=active-speaker-audio` means OpenClaw saw live speaker audio while assistant playback was active. This is useful for distinguishing a real interruption from a Discord speaker-start event with no useful audio.
 5. `barge-in requested reason=...` means OpenClaw asked the realtime provider to cancel or truncate the active response. It includes `outputAudioMs`, `outputActive`, and `playbackChunks` so you can see how much assistant audio had actually played before the interruption.
-6. `realtime audio playback stopped reason=...` is the local Discord playback reset point. The reason says who stopped playback: `barge-in`, `player-idle`, `provider-clear-audio`, `forced-agent-consult`, `forced-agent-consult-answer`, `stream-close`, or `session-close`.
+6. `realtime audio playback stopped reason=...` is the local Discord playback reset point. The reason says who stopped playback: `barge-in`, `player-idle`, `provider-clear-audio`, `forced-agent-consult`, `stream-close`, or `session-close`.
 7. `realtime speaker turn closed` summarizes the captured input turn. `chunks=0` or `hasAudio=false` means the speaker turn opened but no usable audio reached the realtime bridge. `interruptedPlayback=true` means that input turn overlapped assistant output and triggered barge-in logic.
 
 Useful fields:
diff --git a/extensions/discord/src/voice/manager.e2e.test.ts b/extensions/discord/src/voice/manager.e2e.test.ts
index 661f6cf8c85..dab1d7b2222 100644
--- a/extensions/discord/src/voice/manager.e2e.test.ts
+++ b/extensions/discord/src/voice/manager.e2e.test.ts
@@ -919,7 +919,7 @@ describe("DiscordVoiceManager", () => {
           ) => void;
         }
       | undefined;
-    expect(bridgeParams?.autoRespondToAudio).toBe(true);
+    expect(bridgeParams?.autoRespondToAudio).toBe(false);
     expect(bridgeParams?.instructions).toContain("same OpenClaw agent");
     expect(bridgeParams?.tools?.map((tool) => tool.name)).toContain("openclaw_agent_consult");
 
@@ -1038,12 +1038,19 @@ describe("DiscordVoiceManager", () => {
           audioSink?: {
             sendAudio: (audio: Buffer) => void;
           };
+          onEvent?: (event: { direction: "server"; type: string }) => void;
         }
       | undefined;
 
     bridgeParams?.audioSink?.sendAudio(Buffer.alloc(480));
     expect(createAudioResourceMock).toHaveBeenCalledTimes(1);
     expect(player.play).toHaveBeenCalledTimes(1);
+    const firstStream = createAudioResourceMock.mock.calls.at(-1)?.[0] as
+      | { writableEnded?: boolean }
+      | undefined;
+    expect(firstStream?.writableEnded).toBe(false);
+    bridgeParams?.onEvent?.({ direction: "server", type: "response.done" });
+    expect(firstStream?.writableEnded).toBe(true);
 
     const idleHandler = player.on.mock.calls.find(([event]) => event === "idle")?.[1] as
       | (() => void)
@@ -1123,6 +1130,7 @@ describe("DiscordVoiceManager", () => {
     const bridgeParams = createRealtimeVoiceBridgeSessionMock.mock.calls.at(-1)?.[0] as
       | {
           onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void;
+          onEvent?: (event: { direction: "server"; type: string }) => void;
         }
       | undefined;
     bridgeParams?.onTranscript?.("user", "non-owner question", true);
@@ -1139,10 +1147,7 @@ describe("DiscordVoiceManager", () => {
       }),
       expect.anything(),
     );
-    expect(realtimeSessionMock.handleBargeIn).toHaveBeenCalledWith({
-      audioPlaybackActive: true,
-      force: true,
-    });
+    expect(realtimeSessionMock.handleBargeIn).not.toHaveBeenCalled();
     expect(realtimeSessionMock.sendUserMessage).toHaveBeenCalledWith(
       expect.stringContaining("non-owner answer"),
     );
@@ -1191,6 +1196,7 @@ describe("DiscordVoiceManager", () => {
     bridgeParams?.onTranscript?.("user", "owner question", true);
 
     await new Promise((resolve) => setTimeout(resolve, 260));
+    bridgeParams?.onEvent?.({ direction: "server", type: "response.done" });
 
     expect(agentCommandMock).toHaveBeenNthCalledWith(
       1,
@@ -1216,6 +1222,87 @@ describe("DiscordVoiceManager", () => {
     );
   });
 
+  it("queues forced agent-proxy answers until current realtime playback idles", async () => {
+    let resolveFirst: ((value: { payloads: Array<{ text: string }> }) => void) | undefined;
+    let resolveSecond: ((value: { payloads: Array<{ text: string }> }) => void) | undefined;
+    agentCommandMock
+      .mockImplementationOnce(
+        () =>
+          new Promise<{ payloads: Array<{ text: string }> }>((resolve) => {
+            resolveFirst = resolve;
+          }),
+      )
+      .mockImplementationOnce(
+        () =>
+          new Promise<{ payloads: Array<{ text: string }> }>((resolve) => {
+            resolveSecond = resolve;
+          }),
+      );
+    const manager = createManager({
+      groupPolicy: "open",
+      voice: {
+        enabled: true,
+        mode: "agent-proxy",
+        realtime: { provider: "openai" },
+      },
+    });
+
+    await manager.join({ guildId: "g1", channelId: "1001" });
+    const entry = getSessionEntry(manager) as {
+      realtime?: {
+        beginSpeakerTurn: (
+          context: { extraSystemPrompt?: string; senderIsOwner: boolean; speakerLabel: string },
+          userId: string,
+        ) => { close: () => void; sendInputAudio: (audio: Buffer) => void };
+      };
+    };
+    const player = getLastAudioPlayer() as {
+      on: ReturnType<typeof vi.fn>;
+    };
+    const bridgeParams = createRealtimeVoiceBridgeSessionMock.mock.calls.at(-1)?.[0] as
+      | {
+          audioSink?: { sendAudio: (audio: Buffer) => void };
+          onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void;
+        }
+      | undefined;
+
+    const firstTurn = entry.realtime?.beginSpeakerTurn(
+      { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
+      "u-owner",
+    );
+    firstTurn?.sendInputAudio(Buffer.alloc(8));
+    bridgeParams?.onTranscript?.("user", "first question", true);
+    const secondTurn = entry.realtime?.beginSpeakerTurn(
+      { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
+      "u-owner",
+    );
+    secondTurn?.sendInputAudio(Buffer.alloc(8));
+    bridgeParams?.onTranscript?.("user", "second question", true);
+    await new Promise((resolve) => setTimeout(resolve, 260));
+
+    resolveFirst?.({ payloads: [{ text: "first answer" }] });
+    await vi.waitFor(() =>
+      expect(realtimeSessionMock.sendUserMessage).toHaveBeenCalledWith(
+        expect.stringContaining("first answer"),
+      ),
+    );
+    bridgeParams?.audioSink?.sendAudio(Buffer.alloc(480));
+
+    resolveSecond?.({ payloads: [{ text: "second answer" }] });
+    await new Promise((resolve) => setTimeout(resolve, 0));
+    expect(realtimeSessionMock.sendUserMessage).not.toHaveBeenCalledWith(
+      expect.stringContaining("second answer"),
+    );
+
+    const idleHandler = player.on.mock.calls.find(([event]) => event === "idle")?.[1] as
+      | (() => void)
+      | undefined;
+    idleHandler?.();
+    expect(realtimeSessionMock.sendUserMessage).toHaveBeenCalledWith(
+      expect.stringContaining("second answer"),
+    );
+  });
+
   it("matches agent-proxy consult tool calls to the pending transcript", async () => {
     agentCommandMock
       .mockResolvedValueOnce({ payloads: [{ text: "owner answer" }] })
@@ -1250,6 +1337,7 @@ describe("DiscordVoiceManager", () => {
             session: typeof realtimeSessionMock,
           ) => void;
           onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void;
+          onEvent?: (event: { direction: "server"; type: string }) => void;
         }
       | undefined;
 
@@ -1522,6 +1610,7 @@ describe("DiscordVoiceManager", () => {
     expect(realtimeSessionMock.submitToolResult).toHaveBeenCalledWith("call-late", {
       error: "Discord speaker context changed before this realtime consult completed",
     });
+    bridgeParams?.onEvent?.({ direction: "server", type: "response.done" });
 
     bridgeParams?.onTranscript?.("user", "guest followup", true);
     await new Promise((resolve) => setTimeout(resolve, 260));
diff --git a/extensions/discord/src/voice/realtime.ts b/extensions/discord/src/voice/realtime.ts
index 6f93a5e6faa..1b56664649d 100644
--- a/extensions/discord/src/voice/realtime.ts
+++ b/extensions/discord/src/voice/realtime.ts
@@ -273,8 +273,13 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
   private outputAudioRealtimeBytes = 0;
   private outputAudioChunks = 0;
   private outputAudioStartedAt: number | undefined;
+  private outputStreamEnding = false;
+  private queuedExactSpeechMessages: string[] = [];
+  private exactSpeechResponseActive = false;
+  private exactSpeechAudioStarted = false;
   private readonly playerIdleHandler = () => {
     this.resetOutputStream("player-idle");
+    this.completeExactSpeechResponse("player-idle");
   };
 
   constructor(
@@ -307,7 +312,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
           }),
         };
       },
-      deliver: (text) => this.bridge?.sendUserMessage(buildDiscordSpeakExactUserMessage(text)),
+      deliver: (text) => this.enqueueExactSpeechMessage(text),
     });
   }
 
@@ -333,6 +338,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
     const consultPolicy = this.realtimeConfig?.consultPolicy ?? (isAgentProxy ? "always" : "auto");
     this.consultPolicy = consultPolicy;
     const usesRealtimeAgentHandoff = this.params.mode === "bidi" || toolPolicy !== "none";
+    const autoRespondToAudio = !isAgentProxy || consultPolicy !== "always";
     const interruptResponseOnInputAudio = resolveDiscordRealtimeInterruptResponseOnInputAudio({
       realtimeConfig: this.realtimeConfig,
       providerId: resolved.provider.id,
@@ -348,7 +354,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
       providerConfig: resolved.providerConfig,
       audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
       instructions,
-      autoRespondToAudio: usesRealtimeAgentHandoff,
+      autoRespondToAudio,
       interruptResponseOnInputAudio,
       markStrategy: "ack-immediately",
       tools: usesRealtimeAgentHandoff ? resolveRealtimeVoiceAgentConsultTools(toolPolicy) : [],
@@ -376,6 +382,15 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
       onEvent: (event) => {
         const detail = event.detail ? ` ${event.detail}` : "";
         logVoiceVerbose(`realtime ${event.direction}:${event.type}${detail}`);
+        const responseEnded =
+          event.direction === "server" &&
+          (event.type === "response.done" || event.type === "response.cancelled");
+        if (responseEnded) {
+          if (this.exactSpeechResponseActive && !this.exactSpeechAudioStarted) {
+            this.completeExactSpeechResponse(event.type);
+          }
+          this.finishOutputAudioStream(event.type);
+        }
         const interruptionLog = formatRealtimeInterruptionLog(event);
         if (interruptionLog) {
           logger.info(interruptionLog);
@@ -389,7 +404,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
       readProviderConfigString(resolved.providerConfig, "model") ?? resolved.provider.defaultModel;
     const resolvedVoice = readProviderConfigString(resolved.providerConfig, "voice");
     logger.info(
-      `discord voice: realtime bridge starting mode=${this.params.mode} provider=${resolved.provider.id} model=${resolvedModel ?? "default"} voice=${resolvedVoice ?? "default"} consultPolicy=${consultPolicy} toolPolicy=${toolPolicy} autoRespond=${usesRealtimeAgentHandoff} interruptResponse=${interruptResponseOnInputAudio} bargeIn=${resolveDiscordRealtimeBargeIn(
+      `discord voice: realtime bridge starting mode=${this.params.mode} provider=${resolved.provider.id} model=${resolvedModel ?? "default"} voice=${resolvedVoice ?? "default"} consultPolicy=${consultPolicy} toolPolicy=${toolPolicy} autoRespond=${autoRespondToAudio} interruptResponse=${interruptResponseOnInputAudio} bargeIn=${resolveDiscordRealtimeBargeIn(
         {
           realtimeConfig: this.realtimeConfig,
           providerId: resolved.provider.id,
@@ -411,6 +426,9 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
     this.pendingAgentProxyConsultContexts = [];
     this.recentAgentProxyConsultContexts = [];
     this.pendingSpeakerTurns.length = 0;
+    this.queuedExactSpeechMessages = [];
+    this.exactSpeechResponseActive = false;
+    this.exactSpeechAudioStarted = false;
     this.clearOutputAudio("session-close");
     this.bridge?.close();
     this.bridge = null;
@@ -522,6 +540,9 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
     }
     this.syncOutputAudioTimestamp();
     const stream = this.ensureOutputStream();
+    if (this.exactSpeechResponseActive) {
+      this.exactSpeechAudioStarted = true;
+    }
     stream.write(discordPcm);
     this.outputAudioDiscordBytes += discordPcm.length;
     this.outputAudioRealtimeBytes += realtimePcm24kMono.length;
@@ -545,6 +566,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
         this.logOutputAudioStopped("stream-close");
         this.outputStream = null;
         this.resetOutputAudioStats();
+        this.completeExactSpeechResponse("stream-close");
       }
     });
     const resource = voiceSdk.createAudioResource(stream, {
@@ -572,6 +594,69 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
     stream?.destroy();
   }
 
+  private finishOutputAudioStream(reason: string): void {
+    const stream = this.outputStream;
+    if (!stream || stream.destroyed || this.outputStreamEnding) {
+      return;
+    }
+    this.outputStreamEnding = true;
+    logger.info(
+      `discord voice: realtime audio playback finishing reason=${reason} guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} audioMs=${Math.floor(this.outputAudioTimestampMs)} chunks=${this.outputAudioChunks}`,
+    );
+    stream.end();
+  }
+
+  private enqueueExactSpeechMessage(text: string): void {
+    if (this.stopped || !text.trim()) {
+      return;
+    }
+    if (this.exactSpeechResponseActive || this.hasInterruptibleOutputAudio()) {
+      this.queuedExactSpeechMessages.push(text);
+      logger.info(
+        `discord voice: realtime exact speech queued guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} queued=${this.queuedExactSpeechMessages.length} outputAudioMs=${Math.floor(this.outputAudioTimestampMs)} outputActive=${this.isOutputAudioActive()}`,
+      );
+      return;
+    }
+    this.sendExactSpeechMessage(text);
+  }
+
+  private sendExactSpeechMessage(text: string): void {
+    if (this.stopped || !text.trim()) {
+      return;
+    }
+    this.exactSpeechResponseActive = true;
+    this.exactSpeechAudioStarted = false;
+    this.bridge?.sendUserMessage(buildDiscordSpeakExactUserMessage(text));
+  }
+
+  private completeExactSpeechResponse(reason: string): void {
+    if (!this.exactSpeechResponseActive && this.queuedExactSpeechMessages.length === 0) {
+      return;
+    }
+    this.exactSpeechResponseActive = false;
+    this.exactSpeechAudioStarted = false;
+    this.drainQueuedExactSpeechMessages(reason);
+  }
+
+  private drainQueuedExactSpeechMessages(reason: string): void {
+    if (
+      this.stopped ||
+      this.exactSpeechResponseActive ||
+      this.queuedExactSpeechMessages.length === 0 ||
+      this.hasInterruptibleOutputAudio()
+    ) {
+      return;
+    }
+    const next = this.queuedExactSpeechMessages.shift();
+    if (!next) {
+      return;
+    }
+    logger.info(
+      `discord voice: realtime exact speech dequeued reason=${reason} guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} queued=${this.queuedExactSpeechMessages.length}`,
+    );
+    this.sendExactSpeechMessage(next);
+  }
+
   private logOutputAudioStopped(reason: string): void {
     const audioMs = Math.floor(this.outputAudioTimestampMs);
     const chunks = this.outputAudioChunks;
@@ -591,6 +676,7 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
     this.outputAudioRealtimeBytes = 0;
     this.outputAudioChunks = 0;
     this.outputAudioStartedAt = undefined;
+    this.outputStreamEnding = false;
   }
 
   private syncOutputAudioTimestamp(): void {
@@ -802,12 +888,13 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
     logger.info(
       `discord voice: realtime forced agent consult starting chars=${question.length} voiceSession=${this.params.entry.voiceSessionKey} supervisorSession=${this.params.entry.route.sessionKey} agent=${this.params.entry.route.agentId} speaker=${context.speakerLabel} owner=${context.senderIsOwner}`,
     );
-    this.syncOutputAudioTimestamp();
-    logger.info(
-      `discord voice: realtime barge-in requested reason=forced-agent-consult guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} outputAudioMs=${Math.floor(this.outputAudioTimestampMs)} outputActive=${this.isOutputAudioActive()} playbackChunks=${this.outputAudioChunks} force=true`,
-    );
-    this.bridge?.handleBargeIn({ audioPlaybackActive: true, force: true });
-    this.clearOutputAudio("forced-agent-consult");
+    if (this.hasInterruptibleOutputAudio()) {
+      logger.info(
+        `discord voice: realtime barge-in requested reason=forced-agent-consult guild=${this.params.entry.guildId} channel=${this.params.entry.channelId} outputAudioMs=${Math.floor(this.outputAudioTimestampMs)} outputActive=${this.isOutputAudioActive()} playbackChunks=${this.outputAudioChunks} force=true`,
+      );
+      this.bridge?.handleBargeIn({ audioPlaybackActive: true, force: true });
+      this.clearOutputAudio("forced-agent-consult");
+    }
     pending.recent.handledByForcedPlayback = true;
     try {
       const promise = this.runAgentTurn({
@@ -823,16 +910,13 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
         `discord voice: realtime forced agent consult answer (${text.length} chars) elapsedMs=${Date.now() - startedAt} voiceSession=${this.params.entry.voiceSessionKey} supervisorSession=${this.params.entry.route.sessionKey} agent=${this.params.entry.route.agentId}: ${formatRealtimeLogPreview(text)}`,
       );
       if (text.trim()) {
-        this.clearOutputAudio("forced-agent-consult-answer");
-        this.bridge?.sendUserMessage(buildDiscordSpeakExactUserMessage(text));
+        this.enqueueExactSpeechMessage(text);
       }
     } catch (error) {
       logger.warn(
         `discord voice: realtime forced agent consult failed elapsedMs=${Date.now() - startedAt}: ${formatErrorMessage(error)}`,
       );
-      this.bridge?.sendUserMessage(
-        buildDiscordSpeakExactUserMessage(DISCORD_REALTIME_FALLBACK_TEXT),
-      );
+      this.enqueueExactSpeechMessage(DISCORD_REALTIME_FALLBACK_TEXT);
     }
   }