diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e7b725cbb6..ab2c735798c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -146,6 +146,7 @@ Docs: https://docs.openclaw.ai - OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358. - OpenAI/realtime voice: defer `response.create` while a realtime response is still active, retry after `response.done`/`response.cancelled`, and align GA input transcription/noise-reduction defaults with the Codex realtime reference so Discord/Voice Call consult results can resume speaking instead of tripping the active-response race. +- OpenAI/realtime voice: avoid duplicate barge-in cancellation requests, log realtime model interruption/cutoff events in Discord voice logs, and treat OpenAI's no-active-response cancellation reply as a completed cancel so Discord voice sessions do not wedge pending speech after fast interruptions. - Gateway: avoid false degraded event-loop health during rapid health/readiness/status probes unless sustained load has delay co-evidence, while keeping hard delay detection immediate. (#77028) Thanks @rubencu. - Markdown: keep blockquote spans off trailing paragraph separators. Fixes #79646. - Plugin SDK/LM Studio: recover Harmony plain-text tool calls from LM Studio streams. Fixes #78326. diff --git a/docs/channels/discord.md b/docs/channels/discord.md index ff37a0e4d9f..674847c938b 100644 --- a/docs/channels/discord.md +++ b/docs/channels/discord.md @@ -1367,6 +1367,7 @@ Expected voice logs: - On realtime consult: `discord voice: realtime consult requested ... voiceSession=... supervisorSession=... 
/**
 * Produces the info-level Discord voice log line for realtime bridge events
 * that are part of interrupting the model, or `undefined` for any other event.
 *
 * Recognised events:
 * - client `response.cancel`              -> "interrupt requested"
 * - client `conversation.item.truncate`   -> "audio truncated"
 * - server `response.cancelled`           -> "interrupt confirmed"
 * - server `response.done` whose detail contains `status=cancelled`
 *                                         -> "interrupt confirmed"
 * - server `error` whose detail is exactly
 *   "Cancellation failed: no active response found"
 *                                         -> "interrupt raced"
 *
 * @param event - Bridge event to classify; `direction`, `type` and the
 *   optional `detail` are the only fields read.
 * @returns The formatted log line (with ` <detail>` appended when detail is
 *   non-empty), or `undefined` when the event is not interruption related.
 */
function formatRealtimeInterruptionLog(event: RealtimeVoiceBridgeEvent): string | undefined {
  // Each key below contains exactly one ":", so matching on the joined key is
  // equivalent to comparing direction and type separately.
  const origin = `${event.direction}:${event.type}`;
  const suffix = event.detail ? ` ${event.detail}` : "";

  switch (origin) {
    case "client:response.cancel":
      return `discord voice: realtime model interrupt requested ${origin}${suffix}`;
    case "client:conversation.item.truncate":
      return `discord voice: realtime model audio truncated ${origin}${suffix}`;
    case "server:response.cancelled":
      return `discord voice: realtime model interrupt confirmed ${origin}${suffix}`;
    case "server:response.done":
      // A finished response only counts as an interruption when it was cancelled.
      return event.detail?.includes("status=cancelled")
        ? `discord voice: realtime model interrupt confirmed ${origin}${suffix}`
        : undefined;
    case "server:error":
      // Only the exact no-active-response cancellation error is logged here.
      return event.detail === "Cancellation failed: no active response found"
        ? `discord voice: realtime model interrupt raced ${origin}${suffix}`
        : undefined;
    default:
      return undefined;
  }
}
` ${event.detail}` : ""; logVoiceVerbose(`realtime ${event.direction}:${event.type}${detail}`); + const interruptionLog = formatRealtimeInterruptionLog(event); + if (interruptionLog) { + logger.info(interruptionLog); + } }, onError: (error) => logger.warn(`discord voice: realtime error: ${formatErrorMessage(error)}`), diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts index 06511fff55d..1a7a67c4a0c 100644 --- a/extensions/openai/realtime-voice-provider.test.ts +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -881,6 +881,98 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(parseSent(socket).slice(-1)).toEqual([{ type: "response.create" }]); }); + it("does not send duplicate response.cancel while cancellation is pending", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const onEvent = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + onAudio: vi.fn(), + onClearAudio: vi.fn(), + onEvent, + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; + socket.emit( + "message", + Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })), + ); + socket.emit( + "message", + Buffer.from( + JSON.stringify({ + type: "response.audio.delta", + item_id: "item_1", + delta: Buffer.from("assistant audio").toString("base64"), + }), + ), + ); + + bridge.handleBargeIn?.({ audioPlaybackActive: true }); + bridge.handleBargeIn?.({ audioPlaybackActive: true }); + + expect(parseSent(socket).filter((event) => event.type === "response.cancel")).toHaveLength(1); + expect(onEvent).toHaveBeenCalledWith({ + 
direction: "client", + type: "response.cancel", + detail: "reason=barge-in", + }); + expect(onEvent).toHaveBeenCalledWith({ + direction: "client", + type: "conversation.item.truncate", + detail: "reason=barge-in audioEndMs=0", + }); + }); + + it("drains deferred response.create after a no-active-response cancellation error", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const onError = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + onAudio: vi.fn(), + onClearAudio: vi.fn(), + onError, + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; + socket.emit( + "message", + Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })), + ); + + bridge.submitToolResult("call_1", { text: "done" }); + bridge.handleBargeIn?.({ audioPlaybackActive: true }); + socket.emit( + "message", + Buffer.from( + JSON.stringify({ + type: "error", + error: { + message: "Cancellation failed: no active response found", + }, + }), + ), + ); + + expect(onError).not.toHaveBeenCalled(); + expect(parseSent(socket).slice(-1)).toEqual([{ type: "response.create" }]); + }); + it("resets deferred response guards after websocket reconnect", async () => { vi.useFakeTimers(); const provider = buildOpenAIRealtimeVoiceProvider(); diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index c7ecb6dd6d1..971ca2cf003 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -82,6 +82,8 @@ const OPENAI_REALTIME_DEFAULT_MODEL = "gpt-realtime-2"; const OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL = 
"gpt-4o-mini-transcribe"; const OPENAI_REALTIME_ACTIVE_RESPONSE_ERROR_PREFIX = "Conversation already has an active response in progress:"; +const OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR = + "Cancellation failed: no active response found"; type RealtimeEvent = { type: string; @@ -791,6 +793,12 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { this.responseCreatePending = true; return; } + if (detail === OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR) { + this.responseActive = false; + this.responseCancelInFlight = false; + this.flushPendingResponseCreate(); + return; + } this.config.onError?.(new Error(detail)); return; } @@ -807,18 +815,26 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { responseStartTimestamp !== null && assistantItemId !== null && (this.markQueue.length > 0 || options?.audioPlaybackActive === true); - if (options?.audioPlaybackActive === true && this.responseActive) { - this.sendEvent({ type: "response.cancel" }); + if ( + options?.audioPlaybackActive === true && + this.responseActive && + !this.responseCancelInFlight + ) { + this.sendEvent({ type: "response.cancel" }, "reason=barge-in"); this.responseCancelInFlight = true; } if (shouldInterruptProvider) { const elapsedMs = this.latestMediaTimestamp - responseStartTimestamp; - this.sendEvent({ - type: "conversation.item.truncate", - item_id: assistantItemId, - content_index: 0, - audio_end_ms: Math.max(0, elapsedMs), - }); + const audioEndMs = Math.max(0, elapsedMs); + this.sendEvent( + { + type: "conversation.item.truncate", + item_id: assistantItemId, + content_index: 0, + audio_end_ms: audioEndMs, + }, + `reason=barge-in audioEndMs=${audioEndMs}`, + ); this.config.onClearAudio(); this.markQueue = []; this.lastAssistantItemId = null; @@ -862,13 +878,13 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { this.config.onMark?.(markName); } - private sendEvent(event: unknown): void { + private sendEvent(event: unknown, detail?: string): 
void { if (this.ws?.readyState === WebSocket.OPEN) { const type = event && typeof event === "object" && typeof (event as { type?: unknown }).type === "string" ? (event as { type: string }).type : "unknown"; - this.config.onEvent?.({ direction: "client", type }); + this.config.onEvent?.({ direction: "client", type, ...(detail ? { detail } : {}) }); const payload = JSON.stringify(event); captureWsEvent({ url: this.resolveConnectionParams().url,