Mirror of https://github.com/openclaw/openclaw.git, synced 2026-05-09 17:20:43 +00:00.
Commit: fix(openai): log realtime voice interruptions.
@@ -146,6 +146,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
- OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358.
|
||||
- OpenAI/realtime voice: defer `response.create` while a realtime response is still active, retry after `response.done`/`response.cancelled`, and align GA input transcription/noise-reduction defaults with the Codex realtime reference so Discord/Voice Call consult results can resume speaking instead of tripping the active-response race.
|
||||
- OpenAI/realtime voice: avoid duplicate barge-in cancellation requests, log realtime model interruption/cutoff events in Discord voice logs, and treat OpenAI's no-active-response cancellation reply as a completed cancel so Discord voice sessions do not wedge pending speech after fast interruptions.
|
||||
- Gateway: avoid false degraded event-loop health during rapid health/readiness/status probes unless sustained load has delay co-evidence, while keeping hard delay detection immediate. (#77028) Thanks @rubencu.
|
||||
- Markdown: keep blockquote spans off trailing paragraph separators. Fixes #79646.
|
||||
- Plugin SDK/LM Studio: recover Harmony plain-text tool calls from LM Studio streams. Fixes #78326.
|
||||
|
||||
@@ -1367,6 +1367,7 @@ Expected voice logs:
|
||||
- On realtime consult: `discord voice: realtime consult requested ... voiceSession=... supervisorSession=... question=...`
|
||||
- On agent answer: `discord voice: agent turn answer ...`
|
||||
- On same-speaker interruption: `discord voice: realtime barge-in from active speaker audio ...`
|
||||
- On realtime interruption: `discord voice: realtime model interrupt requested client:response.cancel reason=barge-in`, followed by either `discord voice: realtime model audio truncated client:conversation.item.truncate reason=barge-in audioEndMs=...` or `discord voice: realtime model interrupt confirmed server:response.done status=cancelled ...`
|
||||
- On disabled barge-in: `discord voice: realtime capture ignored during playback (barge-in disabled) ...`
|
||||
|
||||
Credentials are resolved per component: LLM route auth for `voice.model`, STT auth for `tools.media.audio`, TTS auth for `messages.tts`/`voice.tts`, and realtime provider auth for `voice.realtime.providers` or the provider's normal auth config.
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
resolveRealtimeVoiceAgentConsultToolPolicy,
|
||||
resolveRealtimeVoiceAgentConsultTools,
|
||||
resolveRealtimeVoiceAgentConsultToolsAllow,
|
||||
type RealtimeVoiceBridgeEvent,
|
||||
type RealtimeVoiceAgentTalkbackQueue,
|
||||
type RealtimeVoiceAgentConsultToolPolicy,
|
||||
type RealtimeVoiceBridgeSession,
|
||||
@@ -62,6 +63,33 @@ function formatRealtimeLogPreview(text: string): string {
|
||||
return `${oneLine.slice(0, DISCORD_REALTIME_LOG_PREVIEW_CHARS)}...`;
|
||||
}
|
||||
|
||||
function formatRealtimeInterruptionLog(event: RealtimeVoiceBridgeEvent): string | undefined {
|
||||
const detail = event.detail ? ` ${event.detail}` : "";
|
||||
if (event.direction === "client") {
|
||||
if (event.type === "response.cancel") {
|
||||
return `discord voice: realtime model interrupt requested ${event.direction}:${event.type}${detail}`;
|
||||
}
|
||||
if (event.type === "conversation.item.truncate") {
|
||||
return `discord voice: realtime model audio truncated ${event.direction}:${event.type}${detail}`;
|
||||
}
|
||||
}
|
||||
if (event.direction === "server") {
|
||||
if (event.type === "response.cancelled") {
|
||||
return `discord voice: realtime model interrupt confirmed ${event.direction}:${event.type}${detail}`;
|
||||
}
|
||||
if (event.type === "response.done" && event.detail?.includes("status=cancelled")) {
|
||||
return `discord voice: realtime model interrupt confirmed ${event.direction}:${event.type}${detail}`;
|
||||
}
|
||||
if (
|
||||
event.type === "error" &&
|
||||
event.detail === "Cancellation failed: no active response found"
|
||||
) {
|
||||
return `discord voice: realtime model interrupt raced ${event.direction}:${event.type}${detail}`;
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function readProviderConfigString(
|
||||
config: RealtimeVoiceProviderConfig,
|
||||
key: string,
|
||||
@@ -214,6 +242,10 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
|
||||
onEvent: (event) => {
|
||||
const detail = event.detail ? ` ${event.detail}` : "";
|
||||
logVoiceVerbose(`realtime ${event.direction}:${event.type}${detail}`);
|
||||
const interruptionLog = formatRealtimeInterruptionLog(event);
|
||||
if (interruptionLog) {
|
||||
logger.info(interruptionLog);
|
||||
}
|
||||
},
|
||||
onError: (error) =>
|
||||
logger.warn(`discord voice: realtime error: ${formatErrorMessage(error)}`),
|
||||
|
||||
@@ -881,6 +881,98 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
|
||||
expect(parseSent(socket).slice(-1)).toEqual([{ type: "response.create" }]);
|
||||
});
|
||||
|
||||
it("does not send duplicate response.cancel while cancellation is pending", async () => {
|
||||
const provider = buildOpenAIRealtimeVoiceProvider();
|
||||
const onEvent = vi.fn();
|
||||
const bridge = provider.createBridge({
|
||||
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
|
||||
onAudio: vi.fn(),
|
||||
onClearAudio: vi.fn(),
|
||||
onEvent,
|
||||
});
|
||||
const connecting = bridge.connect();
|
||||
const socket = FakeWebSocket.instances[0];
|
||||
if (!socket) {
|
||||
throw new Error("expected bridge to create a websocket");
|
||||
}
|
||||
|
||||
socket.readyState = FakeWebSocket.OPEN;
|
||||
socket.emit("open");
|
||||
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
|
||||
await connecting;
|
||||
socket.emit(
|
||||
"message",
|
||||
Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })),
|
||||
);
|
||||
socket.emit(
|
||||
"message",
|
||||
Buffer.from(
|
||||
JSON.stringify({
|
||||
type: "response.audio.delta",
|
||||
item_id: "item_1",
|
||||
delta: Buffer.from("assistant audio").toString("base64"),
|
||||
}),
|
||||
),
|
||||
);
|
||||
|
||||
bridge.handleBargeIn?.({ audioPlaybackActive: true });
|
||||
bridge.handleBargeIn?.({ audioPlaybackActive: true });
|
||||
|
||||
expect(parseSent(socket).filter((event) => event.type === "response.cancel")).toHaveLength(1);
|
||||
expect(onEvent).toHaveBeenCalledWith({
|
||||
direction: "client",
|
||||
type: "response.cancel",
|
||||
detail: "reason=barge-in",
|
||||
});
|
||||
expect(onEvent).toHaveBeenCalledWith({
|
||||
direction: "client",
|
||||
type: "conversation.item.truncate",
|
||||
detail: "reason=barge-in audioEndMs=0",
|
||||
});
|
||||
});
|
||||
|
||||
it("drains deferred response.create after a no-active-response cancellation error", async () => {
|
||||
const provider = buildOpenAIRealtimeVoiceProvider();
|
||||
const onError = vi.fn();
|
||||
const bridge = provider.createBridge({
|
||||
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
|
||||
onAudio: vi.fn(),
|
||||
onClearAudio: vi.fn(),
|
||||
onError,
|
||||
});
|
||||
const connecting = bridge.connect();
|
||||
const socket = FakeWebSocket.instances[0];
|
||||
if (!socket) {
|
||||
throw new Error("expected bridge to create a websocket");
|
||||
}
|
||||
|
||||
socket.readyState = FakeWebSocket.OPEN;
|
||||
socket.emit("open");
|
||||
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
|
||||
await connecting;
|
||||
socket.emit(
|
||||
"message",
|
||||
Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })),
|
||||
);
|
||||
|
||||
bridge.submitToolResult("call_1", { text: "done" });
|
||||
bridge.handleBargeIn?.({ audioPlaybackActive: true });
|
||||
socket.emit(
|
||||
"message",
|
||||
Buffer.from(
|
||||
JSON.stringify({
|
||||
type: "error",
|
||||
error: {
|
||||
message: "Cancellation failed: no active response found",
|
||||
},
|
||||
}),
|
||||
),
|
||||
);
|
||||
|
||||
expect(onError).not.toHaveBeenCalled();
|
||||
expect(parseSent(socket).slice(-1)).toEqual([{ type: "response.create" }]);
|
||||
});
|
||||
|
||||
it("resets deferred response guards after websocket reconnect", async () => {
|
||||
vi.useFakeTimers();
|
||||
const provider = buildOpenAIRealtimeVoiceProvider();
|
||||
|
||||
@@ -82,6 +82,8 @@ const OPENAI_REALTIME_DEFAULT_MODEL = "gpt-realtime-2";
|
||||
const OPENAI_REALTIME_INPUT_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe";
|
||||
const OPENAI_REALTIME_ACTIVE_RESPONSE_ERROR_PREFIX =
|
||||
"Conversation already has an active response in progress:";
|
||||
const OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR =
|
||||
"Cancellation failed: no active response found";
|
||||
|
||||
type RealtimeEvent = {
|
||||
type: string;
|
||||
@@ -791,6 +793,12 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
this.responseCreatePending = true;
|
||||
return;
|
||||
}
|
||||
if (detail === OPENAI_REALTIME_NO_ACTIVE_RESPONSE_CANCEL_ERROR) {
|
||||
this.responseActive = false;
|
||||
this.responseCancelInFlight = false;
|
||||
this.flushPendingResponseCreate();
|
||||
return;
|
||||
}
|
||||
this.config.onError?.(new Error(detail));
|
||||
return;
|
||||
}
|
||||
@@ -807,18 +815,26 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
responseStartTimestamp !== null &&
|
||||
assistantItemId !== null &&
|
||||
(this.markQueue.length > 0 || options?.audioPlaybackActive === true);
|
||||
if (options?.audioPlaybackActive === true && this.responseActive) {
|
||||
this.sendEvent({ type: "response.cancel" });
|
||||
if (
|
||||
options?.audioPlaybackActive === true &&
|
||||
this.responseActive &&
|
||||
!this.responseCancelInFlight
|
||||
) {
|
||||
this.sendEvent({ type: "response.cancel" }, "reason=barge-in");
|
||||
this.responseCancelInFlight = true;
|
||||
}
|
||||
if (shouldInterruptProvider) {
|
||||
const elapsedMs = this.latestMediaTimestamp - responseStartTimestamp;
|
||||
this.sendEvent({
|
||||
type: "conversation.item.truncate",
|
||||
item_id: assistantItemId,
|
||||
content_index: 0,
|
||||
audio_end_ms: Math.max(0, elapsedMs),
|
||||
});
|
||||
const audioEndMs = Math.max(0, elapsedMs);
|
||||
this.sendEvent(
|
||||
{
|
||||
type: "conversation.item.truncate",
|
||||
item_id: assistantItemId,
|
||||
content_index: 0,
|
||||
audio_end_ms: audioEndMs,
|
||||
},
|
||||
`reason=barge-in audioEndMs=${audioEndMs}`,
|
||||
);
|
||||
this.config.onClearAudio();
|
||||
this.markQueue = [];
|
||||
this.lastAssistantItemId = null;
|
||||
@@ -862,13 +878,13 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
this.config.onMark?.(markName);
|
||||
}
|
||||
|
||||
private sendEvent(event: unknown): void {
|
||||
private sendEvent(event: unknown, detail?: string): void {
|
||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
||||
const type =
|
||||
event && typeof event === "object" && typeof (event as { type?: unknown }).type === "string"
|
||||
? (event as { type: string }).type
|
||||
: "unknown";
|
||||
this.config.onEvent?.({ direction: "client", type });
|
||||
this.config.onEvent?.({ direction: "client", type, ...(detail ? { detail } : {}) });
|
||||
const payload = JSON.stringify(event);
|
||||
captureWsEvent({
|
||||
url: this.resolveConnectionParams().url,
|
||||
|
||||
Reference in New Issue
Block a user