fix(openai): log realtime voice interruptions

This commit is contained in:
Peter Steinberger
2026-05-09 09:36:02 +01:00
parent 07fd11e175
commit 21970c9ac9
5 changed files with 152 additions and 10 deletions

View File

@@ -146,6 +146,7 @@ Docs: https://docs.openclaw.ai
- OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358.
- OpenAI/realtime voice: defer `response.create` while a realtime response is still active, retry after `response.done`/`response.cancelled`, and align GA input transcription/noise-reduction defaults with the Codex realtime reference so Discord/Voice Call consult results can resume speaking instead of tripping the active-response race.
- OpenAI/realtime voice: avoid duplicate barge-in cancellation requests, log realtime model interruption/cutoff events in Discord voice logs, and treat OpenAI's no-active-response cancellation reply as a completed cancel so Discord voice sessions do not wedge pending speech after fast interruptions.
- Gateway: avoid false degraded event-loop health during rapid health/readiness/status probes unless sustained load has delay co-evidence, while keeping hard delay detection immediate. (#77028) Thanks @rubencu.
- Markdown: keep blockquote spans off trailing paragraph separators. Fixes #79646.
- Plugin SDK/LM Studio: recover Harmony plain-text tool calls from LM Studio streams. Fixes #78326.

View File

@@ -1367,6 +1367,7 @@ Expected voice logs:
- On realtime consult: `discord voice: realtime consult requested ... voiceSession=... supervisorSession=... question=...`
- On agent answer: `discord voice: agent turn answer ...`
- On same-speaker interruption: `discord voice: realtime barge-in from active speaker audio ...`
- On realtime interruption: `discord voice: realtime model interrupt requested client:response.cancel reason=barge-in`, followed by either `discord voice: realtime model audio truncated client:conversation.item.truncate reason=barge-in audioEndMs=...` or `discord voice: realtime model interrupt confirmed server:response.done status=cancelled ...`
- On disabled barge-in: `discord voice: realtime capture ignored during playback (barge-in disabled) ...`
Credentials are resolved per component: LLM route auth for `voice.model`, STT auth for `tools.media.audio`, TTS auth for `messages.tts`/`voice.tts`, and realtime provider auth for `voice.realtime.providers` or the provider's normal auth config.

View File

@@ -12,6 +12,7 @@ import {
resolveRealtimeVoiceAgentConsultToolPolicy,
resolveRealtimeVoiceAgentConsultTools,
resolveRealtimeVoiceAgentConsultToolsAllow,
type RealtimeVoiceBridgeEvent,
type RealtimeVoiceAgentTalkbackQueue,
type RealtimeVoiceAgentConsultToolPolicy,
type RealtimeVoiceBridgeSession,
@@ -62,6 +63,33 @@ function formatRealtimeLogPreview(text: string): string {
return `${oneLine.slice(0, DISCORD_REALTIME_LOG_PREVIEW_CHARS)}...`;
}
/**
 * Maps a realtime bridge event to a Discord voice interruption log line.
 *
 * Returns a human-readable log message for barge-in / cancellation related
 * events (cancel requested, audio truncated, cancel confirmed, or a cancel
 * that raced a finished response), and `undefined` for every other event so
 * callers can skip logging.
 */
function formatRealtimeInterruptionLog(event: RealtimeVoiceBridgeEvent): string | undefined {
  // Optional detail is rendered with a leading space so it concatenates cleanly.
  const detail = event.detail ? ` ${event.detail}` : "";
  const tag = `${event.direction}:${event.type}${detail}`;
  if (event.direction === "client") {
    // Client-side interruption: we asked the model to stop speaking.
    if (event.type === "response.cancel") {
      return `discord voice: realtime model interrupt requested ${tag}`;
    }
    return event.type === "conversation.item.truncate"
      ? `discord voice: realtime model audio truncated ${tag}`
      : undefined;
  }
  if (event.direction !== "server") {
    return undefined;
  }
  // Server-side confirmation: either an explicit cancelled event, or a
  // response.done whose detail carries the cancelled status.
  const cancelConfirmed =
    event.type === "response.cancelled" ||
    (event.type === "response.done" && event.detail?.includes("status=cancelled") === true);
  if (cancelConfirmed) {
    return `discord voice: realtime model interrupt confirmed ${tag}`;
  }
  // A cancel that arrived after the response already finished is benign;
  // surface it as a race rather than an error.
  if (event.type === "error" && event.detail === "Cancellation failed: no active response found") {
    return `discord voice: realtime model interrupt raced ${tag}`;
  }
  return undefined;
}
function readProviderConfigString(
config: RealtimeVoiceProviderConfig,
key: string,
@@ -214,6 +242,10 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
onEvent: (event) => {
const detail = event.detail ? ` ${event.detail}` : "";
logVoiceVerbose(`realtime ${event.direction}:${event.type}${detail}`);
const interruptionLog = formatRealtimeInterruptionLog(event);
if (interruptionLog) {
logger.info(interruptionLog);
}
},
onError: (error) =>
logger.warn(`discord voice: realtime error: ${formatErrorMessage(error)}`),

View File

@@ -881,6 +881,98 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
expect(parseSent(socket).slice(-1)).toEqual([{ type: "response.create" }]);
});
// Regression test: while a response.cancel is already in flight, a second
// barge-in must not emit another cancel. Exactly one cancel and one truncate
// should reach the socket, and both should be surfaced via onEvent with
// their barge-in detail strings.
it("does not send duplicate response.cancel while cancellation is pending", async () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const onEvent = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onEvent,
});
// Drive the fake websocket through the open + session.updated handshake so
// connect() resolves.
const connecting = bridge.connect();
const socket = FakeWebSocket.instances[0];
if (!socket) {
throw new Error("expected bridge to create a websocket");
}
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
await connecting;
// Simulate the model starting to speak: an active response plus one audio
// delta for assistant item item_1 (so truncation has a target).
socket.emit(
"message",
Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })),
);
socket.emit(
"message",
Buffer.from(
JSON.stringify({
type: "response.audio.delta",
item_id: "item_1",
delta: Buffer.from("assistant audio").toString("base64"),
}),
),
);
// Two rapid barge-ins: the second arrives while the first cancel is still
// pending and must be deduplicated.
bridge.handleBargeIn?.({ audioPlaybackActive: true });
bridge.handleBargeIn?.({ audioPlaybackActive: true });
// Only a single response.cancel may have been sent to the socket.
expect(parseSent(socket).filter((event) => event.type === "response.cancel")).toHaveLength(1);
// Both client-side interruption events carry their reason detail for logging.
expect(onEvent).toHaveBeenCalledWith({
direction: "client",
type: "response.cancel",
detail: "reason=barge-in",
});
expect(onEvent).toHaveBeenCalledWith({
direction: "client",
type: "conversation.item.truncate",
detail: "reason=barge-in audioEndMs=0",
});
});
// Regression test: OpenAI's "no active response found" cancellation error is
// treated as a completed cancel rather than a failure — it must not reach
// onError, and any response.create that was deferred behind the active
// response must be flushed afterwards.
it("drains deferred response.create after a no-active-response cancellation error", async () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const onError = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
onAudio: vi.fn(),
onClearAudio: vi.fn(),
onError,
});
// Drive the fake websocket through the open + session.updated handshake so
// connect() resolves.
const connecting = bridge.connect();
const socket = FakeWebSocket.instances[0];
if (!socket) {
throw new Error("expected bridge to create a websocket");
}
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
await connecting;
// Mark resp_1 as active so the tool result's response.create gets deferred.
socket.emit(
"message",
Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })),
);
bridge.submitToolResult("call_1", { text: "done" });
// Barge-in triggers a response.cancel; the server then replies that no
// response was active (the response had already finished — a benign race).
bridge.handleBargeIn?.({ audioPlaybackActive: true });
socket.emit(
"message",
Buffer.from(
JSON.stringify({
type: "error",
error: {
message: "Cancellation failed: no active response found",
},
}),
),
);
// The benign error is swallowed and the deferred response.create is sent.
expect(onError).not.toHaveBeenCalled();
expect(parseSent(socket).slice(-1)).toEqual([{ type: "response.create" }]);
});
it("resets deferred response guards after websocket reconnect", async () => {
vi.useFakeTimers();
const provider = buildOpenAIRealtimeVoiceProvider();

View File

@@ -82,6 +82,8 @@ const OPENAI_REALTIME_DEFAULT_MODEL = "gpt-realtime-2";
const OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe";
const OPENAI_REALTIME_ACTIVE_RESPONSE_ERROR_PREFIX =
"Conversation already has an active response in progress:";
const OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR =
"Cancellation failed: no active response found";
type RealtimeEvent = {
type: string;
@@ -791,6 +793,12 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
this.responseCreatePending = true;
return;
}
if (detail === OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR) {
this.responseActive = false;
this.responseCancelInFlight = false;
this.flushPendingResponseCreate();
return;
}
this.config.onError?.(new Error(detail));
return;
}
@@ -807,18 +815,26 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
responseStartTimestamp !== null &&
assistantItemId !== null &&
(this.markQueue.length > 0 || options?.audioPlaybackActive === true);
if (options?.audioPlaybackActive === true && this.responseActive) {
this.sendEvent({ type: "response.cancel" });
if (
options?.audioPlaybackActive === true &&
this.responseActive &&
!this.responseCancelInFlight
) {
this.sendEvent({ type: "response.cancel" }, "reason=barge-in");
this.responseCancelInFlight = true;
}
if (shouldInterruptProvider) {
const elapsedMs = this.latestMediaTimestamp - responseStartTimestamp;
this.sendEvent({
type: "conversation.item.truncate",
item_id: assistantItemId,
content_index: 0,
audio_end_ms: Math.max(0, elapsedMs),
});
const audioEndMs = Math.max(0, elapsedMs);
this.sendEvent(
{
type: "conversation.item.truncate",
item_id: assistantItemId,
content_index: 0,
audio_end_ms: audioEndMs,
},
`reason=barge-in audioEndMs=${audioEndMs}`,
);
this.config.onClearAudio();
this.markQueue = [];
this.lastAssistantItemId = null;
@@ -862,13 +878,13 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
this.config.onMark?.(markName);
}
private sendEvent(event: unknown): void {
private sendEvent(event: unknown, detail?: string): void {
if (this.ws?.readyState === WebSocket.OPEN) {
const type =
event && typeof event === "object" && typeof (event as { type?: unknown }).type === "string"
? (event as { type: string }).type
: "unknown";
this.config.onEvent?.({ direction: "client", type });
this.config.onEvent?.({ direction: "client", type, ...(detail ? { detail } : {}) });
const payload = JSON.stringify(event);
captureWsEvent({
url: this.resolveConnectionParams().url,