fix(openai): log realtime voice interruptions

This commit is contained in:
Peter Steinberger
2026-05-09 09:36:02 +01:00
parent 07fd11e175
commit 21970c9ac9
5 changed files with 152 additions and 10 deletions

View File

@@ -146,6 +146,7 @@ Docs: https://docs.openclaw.ai
- OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358.
- OpenAI/realtime voice: defer `response.create` while a realtime response is still active, retry after `response.done`/`response.cancelled`, and align GA input transcription/noise-reduction defaults with the Codex realtime reference so Discord/Voice Call consult results can resume speaking instead of tripping the active-response race.
- OpenAI/realtime voice: avoid duplicate barge-in cancellation requests, log realtime model interruption/cutoff events in Discord voice logs, and treat OpenAI's no-active-response cancellation reply as a completed cancel so Discord voice sessions do not wedge pending speech after fast interruptions.
- Gateway: avoid false degraded event-loop health during rapid health/readiness/status probes unless sustained load has delay co-evidence, while keeping hard delay detection immediate. (#77028) Thanks @rubencu.
- Markdown: keep blockquote spans off trailing paragraph separators. Fixes #79646.
- Plugin SDK/LM Studio: recover Harmony plain-text tool calls from LM Studio streams. Fixes #78326.

View File

@@ -1367,6 +1367,7 @@ Expected voice logs:
- On realtime consult: `discord voice: realtime consult requested ... voiceSession=... supervisorSession=... question=...`
- On agent answer: `discord voice: agent turn answer ...`
- On same-speaker interruption: `discord voice: realtime barge-in from active speaker audio ...`
- On realtime interruption: `discord voice: realtime model interrupt requested client:response.cancel reason=barge-in`, followed by either `discord voice: realtime model audio truncated client:conversation.item.truncate reason=barge-in audioEndMs=...` or `discord voice: realtime model interrupt confirmed server:response.done status=cancelled ...`
- On disabled barge-in: `discord voice: realtime capture ignored during playback (barge-in disabled) ...`
Credentials are resolved per component: LLM route auth for `voice.model`, STT auth for `tools.media.audio`, TTS auth for `messages.tts`/`voice.tts`, and realtime provider auth for `voice.realtime.providers` or the provider's normal auth config.

View File

@@ -12,6 +12,7 @@ import {
resolveRealtimeVoiceAgentConsultToolPolicy,
resolveRealtimeVoiceAgentConsultTools,
resolveRealtimeVoiceAgentConsultToolsAllow,
type RealtimeVoiceBridgeEvent,
type RealtimeVoiceAgentTalkbackQueue,
type RealtimeVoiceAgentConsultToolPolicy,
type RealtimeVoiceBridgeSession,
@@ -62,6 +63,33 @@ function formatRealtimeLogPreview(text: string): string {
return `${oneLine.slice(0, DISCORD_REALTIME_LOG_PREVIEW_CHARS)}...`;
}
/**
 * Maps a realtime bridge event to a Discord voice interruption log line.
 *
 * Returns a human-readable log message for barge-in / cancellation related
 * events (cancel requested, audio truncated, cancel confirmed, or a cancel
 * that raced a finished response), and `undefined` for every other event so
 * callers can skip logging.
 */
function formatRealtimeInterruptionLog(event: RealtimeVoiceBridgeEvent): string | undefined {
  // Optional detail is rendered with a leading space so it concatenates cleanly.
  const detail = event.detail ? ` ${event.detail}` : "";
  const tag = `${event.direction}:${event.type}${detail}`;
  if (event.direction === "client") {
    // Client-side interruption: we asked the model to stop speaking.
    if (event.type === "response.cancel") {
      return `discord voice: realtime model interrupt requested ${tag}`;
    }
    return event.type === "conversation.item.truncate"
      ? `discord voice: realtime model audio truncated ${tag}`
      : undefined;
  }
  if (event.direction !== "server") {
    return undefined;
  }
  // Server-side confirmation: either an explicit cancelled event, or a
  // response.done whose detail carries the cancelled status.
  const cancelConfirmed =
    event.type === "response.cancelled" ||
    (event.type === "response.done" && event.detail?.includes("status=cancelled") === true);
  if (cancelConfirmed) {
    return `discord voice: realtime model interrupt confirmed ${tag}`;
  }
  // A cancel that arrived after the response already finished is benign;
  // surface it as a race rather than an error.
  if (event.type === "error" && event.detail === "Cancellation failed: no active response found") {
    return `discord voice: realtime model interrupt raced ${tag}`;
  }
  return undefined;
}
function readProviderConfigString(
config: RealtimeVoiceProviderConfig,
key: string,
@@ -214,6 +242,10 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
onEvent: (event) => {
const detail = event.detail ? ` ${event.detail}` : "";
logVoiceVerbose(`realtime ${event.direction}:${event.type}${detail}`);
const interruptionLog = formatRealtimeInterruptionLog(event);
if (interruptionLog) {
logger.info(interruptionLog);
}
},
onError: (error) =>
logger.warn(`discord voice: realtime error: ${formatErrorMessage(error)}`),

View File

@@ -881,6 +881,98 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
expect(parseSent(socket).slice(-1)).toEqual([{ type: "response.create" }]);
});
// Regression test: while a response.cancel is already in flight, a second
// barge-in must not emit another cancel. Exactly one cancel and one truncate
// should reach the socket, and both should be surfaced via onEvent with
// their barge-in detail strings.
it("does not send duplicate response.cancel while cancellation is pending", async () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const onEvent = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onEvent,
});
// Drive the fake websocket through the open + session.updated handshake so
// connect() resolves.
const connecting = bridge.connect();
const socket = FakeWebSocket.instances[0];
if (!socket) {
throw new Error("expected bridge to create a websocket");
}
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
await connecting;
// Simulate the model starting to speak: an active response plus one audio
// delta for assistant item item_1 (so truncation has a target).
socket.emit(
"message",
Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })),
);
socket.emit(
"message",
Buffer.from(
JSON.stringify({
type: "response.audio.delta",
item_id: "item_1",
delta: Buffer.from("assistant audio").toString("base64"),
}),
),
);
// Two rapid barge-ins: the second arrives while the first cancel is still
// pending and must be deduplicated.
bridge.handleBargeIn?.({ audioPlaybackActive: true });
bridge.handleBargeIn?.({ audioPlaybackActive: true });
// Only a single response.cancel may have been sent to the socket.
expect(parseSent(socket).filter((event) => event.type === "response.cancel")).toHaveLength(1);
// Both client-side interruption events carry their reason detail for logging.
expect(onEvent).toHaveBeenCalledWith({
direction: "client",
type: "response.cancel",
detail: "reason=barge-in",
});
expect(onEvent).toHaveBeenCalledWith({
direction: "client",
type: "conversation.item.truncate",
detail: "reason=barge-in audioEndMs=0",
});
});
// Regression test: OpenAI's "no active response found" cancellation error is
// treated as a completed cancel rather than a failure — it must not reach
// onError, and any response.create that was deferred behind the active
// response must be flushed afterwards.
it("drains deferred response.create after a no-active-response cancellation error", async () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const onError = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onError,
});
// Drive the fake websocket through the open + session.updated handshake so
// connect() resolves.
const connecting = bridge.connect();
const socket = FakeWebSocket.instances[0];
if (!socket) {
throw new Error("expected bridge to create a websocket");
}
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
await connecting;
// Mark resp_1 as active so the tool result's response.create gets deferred.
socket.emit(
"message",
Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })),
);
bridge.submitToolResult("call_1", { text: "done" });
// Barge-in triggers a response.cancel; the server then replies that no
// response was active (the response had already finished — a benign race).
bridge.handleBargeIn?.({ audioPlaybackActive: true });
socket.emit(
"message",
Buffer.from(
JSON.stringify({
type: "error",
error: {
message: "Cancellation failed: no active response found",
},
}),
),
);
// The benign error is swallowed and the deferred response.create is sent.
expect(onError).not.toHaveBeenCalled();
expect(parseSent(socket).slice(-1)).toEqual([{ type: "response.create" }]);
});
it("resets deferred response guards after websocket reconnect", async () => {
vi.useFakeTimers();
const provider = buildOpenAIRealtimeVoiceProvider();

View File

@@ -82,6 +82,8 @@ const OPENAI_REALTIME_DEFAULT_MODEL = "gpt-realtime-2";
const OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe";
const OPENAI_REALTIME_ACTIVE_RESPONSE_ERROR_PREFIX =
"Conversation already has an active response in progress:";
const OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR =
"Cancellation failed: no active response found";
type RealtimeEvent = {
type: string;
@@ -791,6 +793,12 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
this.responseCreatePending = true;
return;
}
if (detail === OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR) {
this.responseActive = false;
this.responseCancelInFlight = false;
this.flushPendingResponseCreate();
return;
}
this.config.onError?.(new Error(detail));
return;
}
@@ -807,18 +815,26 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
responseStartTimestamp !== null &&
assistantItemId !== null &&
(this.markQueue.length > 0 || options?.audioPlaybackActive === true);
if (options?.audioPlaybackActive === true && this.responseActive) {
this.sendEvent({ type: "response.cancel" });
if (
options?.audioPlaybackActive === true &&
this.responseActive &&
!this.responseCancelInFlight
) {
this.sendEvent({ type: "response.cancel" }, "reason=barge-in");
this.responseCancelInFlight = true;
}
if (shouldInterruptProvider) {
const elapsedMs = this.latestMediaTimestamp - responseStartTimestamp;
this.sendEvent({
type: "conversation.item.truncate",
item_id: assistantItemId,
content_index: 0,
audio_end_ms: Math.max(0, elapsedMs),
});
const audioEndMs = Math.max(0, elapsedMs);
this.sendEvent(
{
type: "conversation.item.truncate",
item_id: assistantItemId,
content_index: 0,
audio_end_ms: audioEndMs,
},
`reason=barge-in audioEndMs=${audioEndMs}`,
);
this.config.onClearAudio();
this.markQueue = [];
this.lastAssistantItemId = null;
@@ -862,13 +878,13 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
this.config.onMark?.(markName);
}
private sendEvent(event: unknown): void {
private sendEvent(event: unknown, detail?: string): void {
if (this.ws?.readyState === WebSocket.OPEN) {
const type =
event && typeof event === "object" && typeof (event as { type?: unknown }).type === "string"
? (event as { type: string }).type
: "unknown";
this.config.onEvent?.({ direction: "client", type });
this.config.onEvent?.({ direction: "client", type, ...(detail ? { detail } : {}) });
const payload = JSON.stringify(event);
captureWsEvent({
url: this.resolveConnectionParams().url,