diff --git a/CHANGELOG.md b/CHANGELOG.md index 9299d92a84e..93dd24f54e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- OpenAI/Google Meet: wait for realtime voice `session.updated` before treating the bridge as connected, so Meet joins do not return with audio queued behind an unconfigured realtime session. Thanks @vincentkoc. - Plugins/catalog: merge official external catalog descriptors into partial package channel config metadata, so lagging WeCom/Yuanbao manifests keep their own schema while still exposing host-supplied labels and setup text. Thanks @vincentkoc. - Plugins/catalog: supplement lagging official external WeCom and Yuanbao npm manifests with channel config descriptors and declared tool contracts from the OpenClaw catalog, so trusted package sweeps no longer fail because external package metadata trails the host contract. Thanks @vincentkoc. - Plugins/install: let trusted official `@openclaw/*` catalog installs recover when npm `latest` points at a prerelease by falling back to the newest stable version, or by selecting the newest exact prerelease for prerelease-only launch packages with a warning instead of making beta/development plugin sweeps fail at install time. Thanks @vincentkoc. diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts index bae2b6fda80..96224206501 100644 --- a/extensions/openai/realtime-voice-provider.test.ts +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -328,6 +328,10 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { onReady, }); const connecting = bridge.connect(); + let connectResolved = false; + void connecting.then(() => { + connectResolved = true; + }); const socket = FakeWebSocket.instances[0]; if (!socket) { throw new Error("expected bridge to create a websocket"); @@ -335,11 +339,12 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { socket.readyState = FakeWebSocket.OPEN; socket.emit("open"); - await connecting; + await Promise.resolve(); bridge.sendAudio(Buffer.from("before-ready")); socket.emit("message", Buffer.from(JSON.stringify({ type: "session.created" }))); + expect(connectResolved).toBe(false); expect(onReady).not.toHaveBeenCalled(); expect(parseSent(socket).map((event) => event.type)).toEqual(["session.update"]); expect(parseSent(socket)[0]?.session).toMatchObject({ @@ -349,7 +354,9 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(bridge.isConnected()).toBe(false); socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; + expect(connectResolved).toBe(true); expect(onReady).toHaveBeenCalledTimes(1); expect(parseSent(socket).map((event) => event.type)).toEqual([ "session.update", @@ -358,6 +365,35 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(bridge.isConnected()).toBe(true); }); + it("rejects connection when session configuration fails before readiness", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + socket.emit( + "message", + Buffer.from( + JSON.stringify({ + type: "error", + error: { message: "invalid realtime session" }, + }), + ), + ); + + await expect(connecting).rejects.toThrow("invalid realtime session"); + expect(bridge.isConnected()).toBe(false); + }); + it("can request PCM16 24 kHz realtime audio for Chrome command-pair bridges", async () => { const provider = buildOpenAIRealtimeVoiceProvider(); const bridge = provider.createBridge({ @@ -375,6 +411,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { socket.readyState = FakeWebSocket.OPEN; socket.emit("open"); + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); await connecting; expect(parseSent(socket)[0]?.session).toMatchObject({ @@ -425,8 +462,8 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { socket.readyState = FakeWebSocket.OPEN; socket.emit("open"); - await connecting; socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; bridge.setMediaTimestamp(1000); socket.emit( @@ -476,8 +513,8 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { socket.readyState = FakeWebSocket.OPEN; socket.emit("open"); - await connecting; socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; const audio = Buffer.from("assistant audio"); socket.emit( @@ -525,8 +562,8 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { socket.readyState = FakeWebSocket.OPEN; socket.emit("open"); - await connecting; socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; bridge.triggerGreeting?.("Say exactly: hello from explicit speech."); diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index 6708897c371..a28590c0737 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -343,7 +343,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { }); connectTimeout = setTimeout(() => { - if (!this.connected && !this.intentionallyClosed) { + if (!this.sessionConfigured && !this.intentionallyClosed) { this.ws?.terminate(); settleReject(new Error("OpenAI realtime connection timeout")); } @@ -364,7 +364,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { }, }); this.sendSessionUpdate(); - settleResolve(); }); this.ws.on("message", (data: Buffer) => { @@ -380,7 +379,14 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { }, }); try { - this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent); + const event = JSON.parse(data.toString()) as RealtimeEvent; + this.handleEvent(event); + if (event.type === "session.updated") { + settleResolve(); + } + if (event.type === "error" && !this.sessionConfigured) { + settleReject(new Error(readRealtimeErrorDetail(event.error))); + } } catch (error) { console.error("[openai] realtime event parse failed:", error); } @@ -398,7 +404,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { capability: "realtime-voice", }, }); - if (!this.connected) { + if (!this.sessionConfigured) { settleReject(error instanceof Error ? error : new Error(String(error))); } this.config.onError?.(error instanceof Error ? error : new Error(String(error)));