diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffc1c433160..13b6fbbba8b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Control UI/Talk: allow the OpenAI Realtime WebRTC offer endpoint through the Control UI CSP, configure browser sessions with explicit VAD/transcription input settings, and surface OpenAI realtime error/lifecycle events instead of leaving Talk stuck as live with no diagnostic. Fixes #73427.
 - Providers/OpenAI: resolve `keychain::` `OPENAI_API_KEY` refs before creating OpenAI Realtime browser sessions or voice bridges, with a bounded cached Keychain lookup. Fixes #72120. Thanks @ctbritt.
 - Discord/gateway: reconnect when the gateway socket closes while waiting for the shared IDENTIFY concurrency window, instead of silently skipping IDENTIFY and leaving the bot online but unresponsive. Fixes #74617. Thanks @zeeskdr-ai.
 - Voice Call: add `sessionScope: "per-call"` for fresh per-call agent memory while preserving the default per-phone caller history. Fixes #45280. Thanks @pondcountry.
diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts
index 56541e8f8e3..197cfb7e209 100644
--- a/extensions/openai/realtime-voice-provider.test.ts
+++ b/extensions/openai/realtime-voice-provider.test.ts
@@ -166,6 +166,27 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
         }),
       }),
     );
+    const request = fetchWithSsrFGuardMock.mock.calls[0]?.[0] as
+      | { init?: { body?: string } }
+      | undefined;
+    const body = JSON.parse(request?.init?.body ?? "{}") as {
+      session?: {
+        audio?: {
+          input?: {
+            turn_detection?: Record<string, unknown>;
+            transcription?: Record<string, unknown>;
+          };
+        };
+      };
+    };
+    expect(body.session?.audio?.input).toEqual({
+      turn_detection: {
+        type: "server_vad",
+        create_response: true,
+        interrupt_response: true,
+      },
+      transcription: { model: "whisper-1" },
+    });
     expect(session).toMatchObject({
       provider: "openai",
       transport: "webrtc-sdp",
diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts
index e319e8b4b26..b2deeb9e054 100644
--- a/extensions/openai/realtime-voice-provider.ts
+++ b/extensions/openai/realtime-voice-provider.ts
@@ -753,6 +753,14 @@ async function createOpenAIRealtimeBrowserSession(
     model,
     instructions: req.instructions,
     audio: {
+      input: {
+        turn_detection: {
+          type: "server_vad",
+          create_response: true,
+          interrupt_response: true,
+        },
+        transcription: { model: "whisper-1" },
+      },
       output: { voice },
     },
   };
diff --git a/src/gateway/control-ui-csp.test.ts b/src/gateway/control-ui-csp.test.ts
index 0a8cd209ed2..5467a297306 100644
--- a/src/gateway/control-ui-csp.test.ts
+++ b/src/gateway/control-ui-csp.test.ts
@@ -17,6 +17,18 @@ describe("buildControlUiCspHeader", () => {
     expect(csp).toContain("font-src 'self' https://fonts.gstatic.com");
   });
 
+  it("allows OpenAI realtime WebRTC offer requests without allowing all HTTPS", () => {
+    const csp = buildControlUiCspHeader();
+    const connectSrc = csp.split("; ").find((directive) => directive.startsWith("connect-src "));
+    expect(connectSrc?.split(" ")).toEqual([
+      "connect-src",
+      "'self'",
+      "ws:",
+      "wss:",
+      "https://api.openai.com",
+    ]);
+  });
+
   it("limits image loading to same-origin, data, and managed blob URLs", () => {
     const csp = buildControlUiCspHeader();
     expect(csp).toContain("img-src 'self' data: blob:");
diff --git a/src/gateway/control-ui-csp.ts b/src/gateway/control-ui-csp.ts
index 08caa928167..c4eb75893a1 100644
--- a/src/gateway/control-ui-csp.ts
+++ b/src/gateway/control-ui-csp.ts
@@ -47,6 +47,6 @@ export function buildControlUiCspHeader(opts?: { inlineScriptHashes?: string[] }
     "img-src 'self' data: blob:",
     "font-src 'self' https://fonts.gstatic.com",
     "worker-src 'self'",
-    "connect-src 'self' ws: wss:",
+    "connect-src 'self' ws: wss: https://api.openai.com",
   ].join("; ");
 }
diff --git a/ui/src/ui/chat/realtime-talk-webrtc.ts b/ui/src/ui/chat/realtime-talk-webrtc.ts
index 18060b96ec4..88bdf7ad506 100644
--- a/ui/src/ui/chat/realtime-talk-webrtc.ts
+++ b/ui/src/ui/chat/realtime-talk-webrtc.ts
@@ -14,6 +14,11 @@ type RealtimeServerEvent = {
   delta?: string;
   transcript?: string;
   arguments?: string;
+  error?: unknown;
+  response?: {
+    status?: string;
+    status_details?: unknown;
+  };
 };
 
 type ToolBuffer = {
@@ -133,11 +138,42 @@ export class WebRtcSdpRealtimeTalkTransport implements RealtimeTalkTransport {
       case "response.function_call_arguments.done":
        void this.handleToolCall(event);
        return;
+      case "input_audio_buffer.speech_started":
+        this.ctx.callbacks.onStatus?.("listening", "Speech detected");
+        return;
+      case "input_audio_buffer.speech_stopped":
+        this.ctx.callbacks.onStatus?.("thinking", "Processing speech");
+        return;
+      case "response.created":
+        this.ctx.callbacks.onStatus?.("thinking", "Generating response");
+        return;
+      case "response.done":
+        this.ctx.callbacks.onStatus?.("listening", this.extractResponseStatus(event));
+        return;
+      case "error":
+        this.ctx.callbacks.onStatus?.("error", this.extractErrorDetail(event.error));
+        return;
       default:
         return;
     }
   }
 
+  private extractResponseStatus(event: RealtimeServerEvent): string | undefined {
+    const status = event.response?.status;
+    return status && status !== "completed" ? `Response ${status}` : undefined;
+  }
+
+  private extractErrorDetail(error: unknown): string {
+    if (!error || typeof error !== "object") {
+      return "Realtime provider error";
+    }
+    const record = error as Record<string, unknown>;
+    const message = typeof record.message === "string" ? record.message.trim() : "";
+    const code = typeof record.code === "string" ? record.code.trim() : "";
+    const type = typeof record.type === "string" ? record.type.trim() : "";
+    return message || code || type || "Realtime provider error";
+  }
+
   private bufferToolDelta(event: RealtimeServerEvent): void {
     const key = event.item_id ?? "unknown";
     const existing = this.toolBuffers.get(key);
diff --git a/ui/src/ui/realtime-talk-webrtc.test.ts b/ui/src/ui/realtime-talk-webrtc.test.ts
index ec60165586c..32e5b1a288b 100644
--- a/ui/src/ui/realtime-talk-webrtc.test.ts
+++ b/ui/src/ui/realtime-talk-webrtc.test.ts
@@ -11,12 +11,19 @@ class FakeDataChannel extends EventTarget {
 }
 
 class FakePeerConnection extends EventTarget {
+  static instances: FakePeerConnection[] = [];
+
   connectionState: RTCPeerConnectionState = "new";
   readonly channel = new FakeDataChannel();
   readonly addTrack = vi.fn();
   localDescription: RTCSessionDescriptionInit | null = null;
   remoteDescription: RTCSessionDescriptionInit | null = null;
 
+  constructor() {
+    super();
+    FakePeerConnection.instances.push(this);
+  }
+
   createDataChannel(): RTCDataChannel {
     return this.channel as unknown as RTCDataChannel;
   }
@@ -44,6 +44,7 @@ describe("WebRtcSdpRealtimeTalkTransport", () => {
   });
 
   beforeEach(() => {
+    FakePeerConnection.instances = [];
     const track = { stop: vi.fn() } as unknown as MediaStreamTrack;
     const stream = {
       getAudioTracks: () => [track],
@@ -93,4 +101,75 @@
     });
     transport.stop();
   });
+
+  it("surfaces realtime provider errors from the OpenAI data channel", async () => {
+    vi.stubGlobal(
+      "fetch",
+      vi.fn(async () => new Response("answer-sdp")) as unknown as typeof fetch,
+    );
+    const onStatus = vi.fn();
+    const transport = new WebRtcSdpRealtimeTalkTransport(
+      {
+        provider: "openai",
+        transport: "webrtc-sdp",
+        clientSecret: "client-secret-123",
+      },
+      {
+        client: {} as never,
+        sessionKey: "main",
+        callbacks: { onStatus },
+      },
+    );
+
+    await transport.start();
+    const peer = FakePeerConnection.instances[0];
+    peer?.channel.dispatchEvent(
+      new MessageEvent("message", {
+        data: JSON.stringify({
+          type: "error",
+          error: { message: "Realtime model rejected the session" },
+        }),
+      }),
+    );
+
+    expect(onStatus).toHaveBeenCalledWith("error", "Realtime model rejected the session");
+    transport.stop();
+  });
+
+  it("surfaces speech and response lifecycle status from the OpenAI data channel", async () => {
+    vi.stubGlobal(
+      "fetch",
+      vi.fn(async () => new Response("answer-sdp")) as unknown as typeof fetch,
+    );
+    const onStatus = vi.fn();
+    const transport = new WebRtcSdpRealtimeTalkTransport(
+      {
+        provider: "openai",
+        transport: "webrtc-sdp",
+        clientSecret: "client-secret-123",
+      },
+      {
+        client: {} as never,
+        sessionKey: "main",
+        callbacks: { onStatus },
+      },
+    );
+
+    await transport.start();
+    const peer = FakePeerConnection.instances[0];
+    for (const type of [
+      "input_audio_buffer.speech_started",
+      "input_audio_buffer.speech_stopped",
+      "response.created",
+      "response.done",
+    ]) {
+      peer?.channel.dispatchEvent(new MessageEvent("message", { data: JSON.stringify({ type }) }));
+    }
+
+    expect(onStatus).toHaveBeenCalledWith("listening", "Speech detected");
+    expect(onStatus).toHaveBeenCalledWith("thinking", "Processing speech");
+    expect(onStatus).toHaveBeenCalledWith("thinking", "Generating response");
+    expect(onStatus).toHaveBeenCalledWith("listening", undefined);
+    transport.stop();
+  });
 });