From 88b983a7139e147f46aa12fef8b66794a27c96a6 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 4 May 2026 00:17:50 +0100 Subject: [PATCH] fix: stabilize Google Meet realtime audio --- CHANGELOG.md | 1 + extensions/google-meet/index.test.ts | 270 ++++++++++++++++-- extensions/google-meet/src/cli.ts | 11 + extensions/google-meet/src/realtime-node.ts | 25 +- extensions/google-meet/src/realtime.ts | 91 +++++- extensions/google-meet/src/runtime.ts | 85 ++++-- .../google-meet/src/transports/chrome.ts | 102 +++---- .../google-meet/src/transports/types.ts | 18 ++ .../openai/realtime-voice-provider.test.ts | 97 +++++++ extensions/openai/realtime-voice-provider.ts | 47 ++- src/plugin-sdk/realtime-voice.ts | 1 + src/plugins/install.ts | 2 +- src/realtime-voice/provider-types.ts | 7 + src/realtime-voice/session-runtime.ts | 3 + 14 files changed, 640 insertions(+), 120 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77a05dc4d23..e9cb1c92e27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai - Plugins/catalog: supplement lagging official external WeCom and Yuanbao npm manifests with channel config descriptors and declared tool contracts from the OpenClaw catalog, so trusted package sweeps no longer fail because external package metadata trails the host contract. Thanks @vincentkoc. - Plugins/install: let trusted official `@openclaw/*` catalog installs recover when npm `latest` points at a prerelease by falling back to the newest stable version, or by allowing prerelease-only launch packages with a warning instead of making beta/development plugin sweeps fail at install time. Thanks @vincentkoc. +- Google Meet: grant Chrome media permissions against the actual Meet tab, start the local realtime audio bridge only after Meet joins, expose realtime transcripts in status/logs, and force explicit audio responses with current OpenAI realtime output-audio events so BlackHole capture does not keep the OpenClaw participant muted or silent. - Google Meet: use the local call-control microphone button instead of disabled remote participant mute buttons, and block realtime speech when the OpenClaw Meet microphone remains muted. - Google Meet: refresh realtime browser state during status and retry delayed speech after Meet finishes joining, so a just-opened in-call tab no longer leaves speech stuck behind stale `not-in-call` health. - Plugins/install: recover the install ledger from the managed npm root when `plugins/installs.json` is empty or partial, so reinstalling Discord and Codex no longer makes the other installed plugin disappear. diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index 3c6858aa864..48aa286180e 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -1865,6 +1865,206 @@ describe("google-meet plugin", () => { } }); + it("grants local Chrome Meet media permissions against the opened tab", async () => { + const callGatewayFromCli = mockLocalMeetBrowserRequest({ + inCall: true, + micMuted: false, + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + }); + const { methods } = setup({ + defaultMode: "realtime", + defaultTransport: "chrome", + chrome: { + audioBridgeCommand: ["bridge", "start"], + }, + realtime: { introMessage: "" }, + }); + const handler = methods.get("googlemeet.join") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const respond = vi.fn(); + + await handler?.({ + params: { url: "https://meet.google.com/abc-defg-hij" }, + respond, + }); + + expect(respond.mock.calls[0]?.[0]).toBe(true); + expect(callGatewayFromCli).toHaveBeenCalledWith( + "browser.request", + expect.any(Object), + expect.objectContaining({ + method: "POST", + path: "/permissions/grant", + body: expect.objectContaining({ + origin: "https://meet.google.com", + permissions: ["audioCapture", "videoCapture"], + targetId: "local-meet-tab", + }), + }), + { progress: false }, + ); + }); + + it("starts the local realtime audio bridge after Meet is inspected", async () => { + const events: string[] = []; + const callGatewayFromCli = vi.fn( + async ( + _method: string, + _opts: unknown, + params?: unknown, + _extra?: unknown, + ): Promise> => { + const request = params as { + path?: string; + body?: { fn?: string; targetId?: string; url?: string }; + }; + events.push(`browser:${request.path}`); + if (request.path === "/tabs") { + return { tabs: [] }; + } + if (request.path === "/tabs/open") { + return { + targetId: "local-meet-tab", + title: "Meet", + url: request.body?.url ?? "https://meet.google.com/abc-defg-hij", + }; + } + if (request.path === "/tabs/focus" || request.path === "/permissions/grant") { + return { ok: true }; + } + if (request.path === "/act") { + return { + result: JSON.stringify({ + inCall: true, + micMuted: false, + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + }), + }; + } + throw new Error(`unexpected browser request path ${request.path}`); + }, + ); + chromeTransportTesting.setDepsForTest({ callGatewayFromCli }); + const { methods } = setup( + { + defaultMode: "realtime", + defaultTransport: "chrome", + chrome: { + audioBridgeCommand: ["bridge", "start"], + }, + realtime: { introMessage: "" }, + }, + { + runCommandWithTimeoutHandler: async (argv) => { + events.push(`command:${argv.join(" ")}`); + return argv[0] === "/usr/sbin/system_profiler" + ? { code: 0, stdout: "BlackHole 2ch", stderr: "" } + : { code: 0, stdout: "", stderr: "" }; + }, + }, + ); + const handler = methods.get("googlemeet.join") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const respond = vi.fn(); + + await handler?.({ + params: { url: "https://meet.google.com/abc-defg-hij" }, + respond, + }); + + expect(respond.mock.calls[0]?.[0]).toBe(true); + expect(events.indexOf("browser:/act")).toBeGreaterThan(-1); + expect(events.indexOf("command:bridge start")).toBeGreaterThan(events.indexOf("browser:/act")); + }); + + it("does not start the local realtime audio bridge while Meet admission is pending", async () => { + const events: string[] = []; + const callGatewayFromCli = vi.fn( + async ( + _method: string, + _opts: unknown, + params?: unknown, + _extra?: unknown, + ): Promise> => { + const request = params as { path?: string; body?: { targetId?: string; url?: string } }; + events.push(`browser:${request.path}`); + if (request.path === "/tabs") { + return { tabs: [] }; + } + if (request.path === "/tabs/open") { + return { + targetId: "local-meet-tab", + title: "Meet", + url: request.body?.url ?? "https://meet.google.com/abc-defg-hij", + }; + } + if (request.path === "/tabs/focus" || request.path === "/permissions/grant") { + return { ok: true }; + } + if (request.path === "/act") { + return { + result: JSON.stringify({ + inCall: false, + lobbyWaiting: true, + manualActionRequired: true, + manualActionReason: "meet-admission-required", + manualActionMessage: "Admit the OpenClaw browser participant in Google Meet.", + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }), + }; + } + throw new Error(`unexpected browser request path ${request.path}`); + }, + ); + chromeTransportTesting.setDepsForTest({ callGatewayFromCli }); + const { methods } = setup( + { + defaultMode: "realtime", + defaultTransport: "chrome", + chrome: { + audioBridgeCommand: ["bridge", "start"], + waitForInCallMs: 1, + }, + realtime: { introMessage: "" }, + }, + { + runCommandWithTimeoutHandler: async (argv) => { + events.push(`command:${argv.join(" ")}`); + return argv[0] === "/usr/sbin/system_profiler" + ? { code: 0, stdout: "BlackHole 2ch", stderr: "" } + : { code: 0, stdout: "", stderr: "" }; + }, + }, + ); + const handler = methods.get("googlemeet.join") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const respond = vi.fn(); + + await handler?.({ + params: { url: "https://meet.google.com/abc-defg-hij" }, + respond, + }); + + expect(respond.mock.calls[0]?.[0]).toBe(true); + expect(events).toContain("browser:/act"); + expect(events).not.toContain("command:bridge start"); + }); + it("refreshes observe-only caption health when status is requested", async () => { let openedTab = false; let actCount = 0; @@ -2790,7 +2990,8 @@ describe("google-meet plugin", () => { chrome: { health: { inCall: true, - speechReady: true, + speechReady: false, + speechBlockedReason: "audio-bridge-unavailable", }, }, }, @@ -3239,21 +3440,7 @@ describe("google-meet plugin", () => { }); it("pipes Chrome command-pair audio through the realtime provider", async () => { - let callbacks: - | { - onAudio: (audio: Buffer) => void; - onClearAudio: () => void; - onMark?: (markName: string) => void; - onToolCall?: (event: { - itemId: string; - callId: string; - name: string; - args: unknown; - }) => void; - onReady?: () => void; - tools?: unknown[]; - } - | undefined; + let callbacks: Parameters[0] | undefined; const sendAudio = vi.fn(); const bridge = { supportsToolResultContinuation: true, @@ -3357,6 +3544,14 @@ describe("google-meet plugin", () => { callbacks?.onClearAudio(); callbacks?.onAudio(Buffer.from([6, 7])); callbacks?.onReady?.(); + callbacks?.onTranscript?.("assistant", "How can I help you?", true); + callbacks?.onTranscript?.("user", "Please summarize the launch.", true); + callbacks?.onEvent?.({ direction: "client", type: "response.create" }); + callbacks?.onEvent?.({ + direction: "server", + type: "response.done", + detail: "status=completed", + }); callbacks?.onToolCall?.({ itemId: "item-1", callId: "tool-call-1", @@ -3396,6 +3591,23 @@ describe("google-meet plugin", () => { audioOutputActive: true, lastInputBytes: 3, lastOutputBytes: 4, + realtimeTranscriptLines: 2, + lastRealtimeTranscriptRole: "user", + lastRealtimeTranscriptText: "Please summarize the launch.", + lastRealtimeEventType: "server:response.done", + lastRealtimeEventDetail: "status=completed", + recentRealtimeTranscript: [ + expect.objectContaining({ role: "assistant", text: "How can I help you?" }), + expect.objectContaining({ role: "user", text: "Please summarize the launch." }), + ], + recentRealtimeEvents: [ + expect.objectContaining({ direction: "client", type: "response.create" }), + expect.objectContaining({ + direction: "server", + type: "response.done", + detail: "status=completed", + }), + ], clearCount: 1, }); expect(callbacks).toMatchObject({ @@ -3545,20 +3757,7 @@ describe("google-meet plugin", () => { }); it("pipes paired-node command-pair audio through the realtime provider", async () => { - let callbacks: - | { - onAudio: (audio: Buffer) => void; - onClearAudio: () => void; - onToolCall?: (event: { - itemId: string; - callId: string; - name: string; - args: unknown; - }) => void; - onReady?: () => void; - tools?: unknown[]; - } - | undefined; + let callbacks: Parameters[0] | undefined; const sendAudio = vi.fn(); const bridge = { supportsToolResultContinuation: true, @@ -3633,6 +3832,12 @@ describe("google-meet plugin", () => { callbacks?.onAudio(Buffer.from([1, 2, 3])); callbacks?.onClearAudio(); callbacks?.onReady?.(); + callbacks?.onTranscript?.("assistant", "How can I help from the node?", true); + callbacks?.onEvent?.({ + direction: "server", + type: "response.done", + detail: "status=completed", + }); callbacks?.onToolCall?.({ itemId: "item-1", callId: "tool-call-1", @@ -3715,6 +3920,11 @@ describe("google-meet plugin", () => { audioOutputActive: true, lastInputBytes: 3, lastOutputBytes: 3, + realtimeTranscriptLines: 1, + lastRealtimeTranscriptRole: "assistant", + lastRealtimeTranscriptText: "How can I help from the node?", + lastRealtimeEventType: "server:response.done", + lastRealtimeEventDetail: "status=completed", clearCount: 1, }); diff --git a/extensions/google-meet/src/cli.ts b/extensions/google-meet/src/cli.ts index dde9d769653..59c9d88f11b 100644 --- a/extensions/google-meet/src/cli.ts +++ b/extensions/google-meet/src/cli.ts @@ -388,6 +388,17 @@ function writeDoctorStatus(status: Awaited = []; + const transcript: GoogleMeetRealtimeTranscriptEntry[] = []; + const realtimeEvents: GoogleMeetRealtimeEventEntry[] = []; const stop = async () => { if (stopped) { @@ -148,11 +155,15 @@ export async function startNodeRealtimeAudioBridge(params: { }, onTranscript: (role, text, isFinal) => { if (isFinal) { - transcript.push({ role, text }); - if (transcript.length > 40) { - transcript.splice(0, transcript.length - 40); - } - params.logger.debug?.(`[google-meet] ${role}: ${text}`); + recordGoogleMeetRealtimeTranscript(transcript, role, text); + params.logger.info(`[google-meet] node realtime ${role}: ${text}`); + } + }, + onEvent: (event) => { + recordGoogleMeetRealtimeEvent(realtimeEvents, event); + if (event.type === "error" || event.type === "response.done") { + const detail = event.detail ? ` ${event.detail}` : ""; + params.logger.info(`[google-meet] node realtime ${event.direction}:${event.type}${detail}`); } }, onToolCall: (event, session) => { @@ -261,6 +272,8 @@ export async function startNodeRealtimeAudioBridge(params: { lastClearAt, lastInputBytes, lastOutputBytes, + ...getGoogleMeetRealtimeTranscriptHealth(transcript), + ...getGoogleMeetRealtimeEventHealth(realtimeEvents), consecutiveInputErrors, lastInputError, clearCount, diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index f059528552e..a398f215e33 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -9,6 +9,7 @@ import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, resolveConfiguredRealtimeVoiceProvider, type RealtimeVoiceBridgeSession, + type RealtimeVoiceBridgeEvent, type RealtimeVoiceProviderConfig, type RealtimeVoiceProviderPlugin, } from "openclaw/plugin-sdk/realtime-voice"; @@ -55,6 +56,77 @@ type ResolvedRealtimeProvider = { providerConfig: RealtimeVoiceProviderConfig; }; +export type GoogleMeetRealtimeTranscriptEntry = { + at: string; + role: "user" | "assistant"; + text: string; +}; + +export function recordGoogleMeetRealtimeTranscript( + transcript: GoogleMeetRealtimeTranscriptEntry[], + role: "user" | "assistant", + text: string, +): GoogleMeetRealtimeTranscriptEntry { + const entry = { at: new Date().toISOString(), role, text }; + transcript.push(entry); + if (transcript.length > 40) { + transcript.splice(0, transcript.length - 40); + } + return entry; +} + +export function getGoogleMeetRealtimeTranscriptHealth( + transcript: GoogleMeetRealtimeTranscriptEntry[], +): Pick< + GoogleMeetChromeHealth, + | "realtimeTranscriptLines" + | "lastRealtimeTranscriptAt" + | "lastRealtimeTranscriptRole" + | "lastRealtimeTranscriptText" + | "recentRealtimeTranscript" +> { + const last = transcript.at(-1); + return { + realtimeTranscriptLines: transcript.length, + lastRealtimeTranscriptAt: last?.at, + lastRealtimeTranscriptRole: last?.role, + lastRealtimeTranscriptText: last?.text, + recentRealtimeTranscript: transcript.slice(-5), + }; +} + +export type GoogleMeetRealtimeEventEntry = RealtimeVoiceBridgeEvent & { + at: string; +}; + +export function recordGoogleMeetRealtimeEvent( + events: GoogleMeetRealtimeEventEntry[], + event: RealtimeVoiceBridgeEvent, +) { + events.push({ at: new Date().toISOString(), ...event }); + if (events.length > 40) { + events.splice(0, events.length - 40); + } +} + +export function getGoogleMeetRealtimeEventHealth( + events: GoogleMeetRealtimeEventEntry[], +): Pick< + GoogleMeetChromeHealth, + | "lastRealtimeEventAt" + | "lastRealtimeEventType" + | "lastRealtimeEventDetail" + | "recentRealtimeEvents" +> { + const last = events.at(-1); + return { + lastRealtimeEventAt: last?.at, + lastRealtimeEventType: last ? `${last.direction}:${last.type}` : undefined, + lastRealtimeEventDetail: last?.detail, + recentRealtimeEvents: events.slice(-10), + }; +} + function splitCommand(argv: string[]): { command: string; args: string[] } { const [command, ...args] = argv; if (!command) { @@ -312,7 +384,8 @@ export async function startCommandRealtimeAudioBridge(params: { fullConfig: params.fullConfig, providers: params.providers, }); - const transcript: Array<{ role: "user" | "assistant"; text: string }> = []; + const transcript: GoogleMeetRealtimeTranscriptEntry[] = []; + const realtimeEvents: GoogleMeetRealtimeEventEntry[] = []; bridge = createRealtimeVoiceBridgeSession({ provider: resolved.provider, providerConfig: resolved.providerConfig, @@ -335,11 +408,15 @@ export async function startCommandRealtimeAudioBridge(params: { }, onTranscript: (role, text, isFinal) => { if (isFinal) { - transcript.push({ role, text }); - if (transcript.length > 40) { - transcript.splice(0, transcript.length - 40); - } - params.logger.debug?.(`[google-meet] ${role}: ${text}`); + recordGoogleMeetRealtimeTranscript(transcript, role, text); + params.logger.info(`[google-meet] realtime ${role}: ${text}`); + } + }, + onEvent: (event) => { + recordGoogleMeetRealtimeEvent(realtimeEvents, event); + if (event.type === "error" || event.type === "response.done") { + const detail = event.detail ? ` ${event.detail}` : ""; + params.logger.info(`[google-meet] realtime ${event.direction}:${event.type}${detail}`); } }, onToolCall: (event, session) => { @@ -414,6 +491,8 @@ export async function startCommandRealtimeAudioBridge(params: { lastInputBytes, lastOutputBytes, suppressedInputBytes, + ...getGoogleMeetRealtimeTranscriptHealth(transcript), + ...getGoogleMeetRealtimeEventHealth(realtimeEvents), lastClearAt, clearCount, bridgeClosed: stopped, diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index ea3972225b3..a953b4353ce 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -27,6 +27,11 @@ import { speakMeetViaVoiceCallGateway, } from "./voice-call-gateway.js"; +type ChromeAudioBridgeResult = NonNullable< + | Awaited>["audioBridge"] + | Awaited>["audioBridge"] +>; + function nowIso(): string { return new Date().toISOString(); } @@ -421,26 +426,9 @@ export class GoogleMeetRuntime { launched: result.launched, nodeId: "nodeId" in result ? result.nodeId : undefined, browserProfile: this.params.config.chrome.browserProfile, - audioBridge: result.audioBridge - ? { - type: result.audioBridge.type, - provider: - result.audioBridge.type === "command-pair" || - result.audioBridge.type === "node-command-pair" - ? result.audioBridge.providerId - : undefined, - } - : undefined, health: "browser" in result ? result.browser : undefined, }; - if ( - result.audioBridge?.type === "command-pair" || - result.audioBridge?.type === "node-command-pair" - ) { - this.#sessionStops.set(session.id, result.audioBridge.stop); - this.#sessionSpeakers.set(session.id, result.audioBridge.speak); - this.#sessionHealth.set(session.id, result.audioBridge.getHealth); - } + this.#attachChromeAudioBridge(session, result.audioBridge); session.notes.push( result.audioBridge ? transport === "chrome-node" @@ -558,6 +546,7 @@ export class GoogleMeetRuntime { return { found: true, spoken: true, session }; } await this.#refreshBrowserHealthForChromeSession(session); + await this.#ensureChromeRealtimeBridge(session); const speak = this.#sessionSpeakers.get(sessionId); if (!speak || session.state !== "active") { return { found: true, spoken: false, session }; @@ -579,7 +568,7 @@ export class GoogleMeetRuntime { async #speakWhenReady(session: GoogleMeetSession, instructions: string): Promise { let result = await this.speak(session.id, instructions); - if (result.spoken || !session.chrome?.audioBridge || session.transport === "twilio") { + if (result.spoken || session.transport === "twilio") { return result.spoken; } const waitMs = Math.min( @@ -825,6 +814,64 @@ export class GoogleMeetRuntime { this.#refreshSpeechReadiness(session); } + #attachChromeAudioBridge( + session: GoogleMeetSession, + audioBridge: ChromeAudioBridgeResult | undefined, + ) { + if (!session.chrome || !audioBridge) { + return; + } + session.chrome.audioBridge = { + type: audioBridge.type, + provider: + audioBridge.type === "command-pair" || audioBridge.type === "node-command-pair" + ? audioBridge.providerId + : undefined, + }; + if (audioBridge.type === "command-pair" || audioBridge.type === "node-command-pair") { + this.#sessionStops.set(session.id, audioBridge.stop); + this.#sessionSpeakers.set(session.id, audioBridge.speak); + this.#sessionHealth.set(session.id, audioBridge.getHealth); + } + } + + async #ensureChromeRealtimeBridge(session: GoogleMeetSession) { + if ( + session.mode !== "realtime" || + session.transport !== "chrome" || + session.state !== "active" || + !session.chrome || + session.chrome.audioBridge + ) { + return; + } + const health = session.chrome.health; + if ( + health?.inCall !== true || + health.micMuted === true || + health.manualActionRequired === true + ) { + return; + } + const result = await launchChromeMeet({ + runtime: this.params.runtime, + config: { + ...this.params.config, + chrome: { + ...this.params.config.chrome, + launch: false, + }, + }, + fullConfig: this.params.fullConfig, + meetingSessionId: session.id, + mode: session.mode, + url: session.url, + logger: this.params.logger, + }); + this.#attachChromeAudioBridge(session, result.audioBridge); + session.updatedAt = nowIso(); + } + #refreshSpeechReadiness(session: GoogleMeetSession) { const readiness = evaluateSpeechReadiness(session); if (readiness.ready) { diff --git a/extensions/google-meet/src/transports/chrome.ts b/extensions/google-meet/src/transports/chrome.ts index 32c66afb88e..10879988ca6 100644 --- a/extensions/google-meet/src/transports/chrome.ts +++ b/extensions/google-meet/src/transports/chrome.ts @@ -96,12 +96,10 @@ export async function launchChromeMeet(params: { | ({ type: "command-pair" } & ChromeRealtimeAudioBridgeHandle); browser?: GoogleMeetChromeHealth; }> { - let audioBridge: - | { type: "external-command" } - | ({ type: "command-pair" } & ChromeRealtimeAudioBridgeHandle) - | undefined; - - if (params.mode === "realtime") { + const checkRealtimeAudioPrerequisites = async () => { + if (params.mode !== "realtime") { + return; + } await assertBlackHole2chAvailable({ runtime: params.runtime, timeoutMs: Math.min(params.config.chrome.joinTimeoutMs, 10_000), @@ -118,7 +116,16 @@ export async function launchChromeMeet(params: { ); } } + }; + const startRealtimeAudioBridge = async (): Promise< + | { type: "external-command" } + | ({ type: "command-pair" } & ChromeRealtimeAudioBridgeHandle) + | undefined + > => { + if (params.mode !== "realtime") { + return undefined; + } if (params.config.chrome.audioBridgeCommand) { const bridge = await params.runtime.system.runCommandWithTimeout( params.config.chrome.audioBridgeCommand, @@ -129,55 +136,46 @@ export async function launchChromeMeet(params: { `failed to start Chrome audio bridge: ${bridge.stderr || bridge.stdout || bridge.code}`, ); } - audioBridge = { type: "external-command" }; - } else { - if (!params.config.chrome.audioInputCommand || !params.config.chrome.audioOutputCommand) { - throw new Error( - "Chrome realtime mode requires chrome.audioInputCommand and chrome.audioOutputCommand, or chrome.audioBridgeCommand for an external bridge.", - ); - } - audioBridge = { - type: "command-pair", - ...(await startCommandRealtimeAudioBridge({ - config: params.config, - fullConfig: params.fullConfig, - runtime: params.runtime, - meetingSessionId: params.meetingSessionId, - inputCommand: params.config.chrome.audioInputCommand, - outputCommand: params.config.chrome.audioOutputCommand, - logger: params.logger, - })), - }; + return { type: "external-command" }; } - } - - if (!params.config.chrome.launch) { - return { launched: false, audioBridge }; - } - - let commandPairBridgeStopped = false; - const stopCommandPairBridge = async () => { - if (commandPairBridgeStopped) { - return; - } - commandPairBridgeStopped = true; - if (audioBridge?.type === "command-pair") { - await audioBridge.stop(); + if (!params.config.chrome.audioInputCommand || !params.config.chrome.audioOutputCommand) { + throw new Error( + "Chrome realtime mode requires chrome.audioInputCommand and chrome.audioOutputCommand, or chrome.audioBridgeCommand for an external bridge.", + ); } + return { + type: "command-pair", + ...(await startCommandRealtimeAudioBridge({ + config: params.config, + fullConfig: params.fullConfig, + runtime: params.runtime, + meetingSessionId: params.meetingSessionId, + inputCommand: params.config.chrome.audioInputCommand, + outputCommand: params.config.chrome.audioOutputCommand, + logger: params.logger, + })), + }; }; - try { - const result = await openMeetWithBrowserRequest({ - callBrowser: callLocalBrowserRequest, - config: params.config, - mode: params.mode, - url: params.url, - }); - return { ...result, audioBridge }; - } catch (error) { - await stopCommandPairBridge(); - throw error; + await checkRealtimeAudioPrerequisites(); + + if (!params.config.chrome.launch) { + return { launched: false, audioBridge: await startRealtimeAudioBridge() }; } + + const result = await openMeetWithBrowserRequest({ + callBrowser: callLocalBrowserRequest, + config: params.config, + mode: params.mode, + url: params.url, + }); + const shouldStartRealtimeBridge = + params.mode === "realtime" && + result.browser?.inCall === true && + result.browser.micMuted !== true && + result.browser.manualActionRequired !== true; + const audioBridge = shouldStartRealtimeBridge ? await startRealtimeAudioBridge() : undefined; + return { ...result, audioBridge }; } function parseNodeStartResult(raw: unknown): { @@ -296,6 +294,7 @@ async function grantMeetMediaPermissions(params: { callBrowser: BrowserRequestCaller; timeoutMs: number; allowMicrophone: boolean; + targetId: string; }): Promise { if (!params.allowMicrophone) { return ["Observe-only mode skips Meet microphone/camera permission grants."]; @@ -308,6 +307,7 @@ async function grantMeetMediaPermissions(params: { origin: "https://meet.google.com", permissions: ["audioCapture", "videoCapture"], optionalPermissions: ["speakerSelection"], + targetId: params.targetId, timeoutMs: Math.min(params.timeoutMs, 5_000), }, timeoutMs: Math.min(params.timeoutMs, 5_000), @@ -611,6 +611,7 @@ async function openMeetWithBrowserRequest(params: { const permissionNotes = await grantMeetMediaPermissions({ allowMicrophone: params.mode === "realtime", callBrowser: params.callBrowser, + targetId, timeoutMs, }); const deadline = Date.now() + Math.max(0, params.config.chrome.waitForInCallMs); @@ -703,6 +704,7 @@ async function inspectRecoverableMeetTab(params: { : await grantMeetMediaPermissions({ allowMicrophone, callBrowser: params.callBrowser, + targetId: params.targetId, timeoutMs: params.timeoutMs, }); const evaluated = await params.callBrowser({ diff --git a/extensions/google-meet/src/transports/types.ts b/extensions/google-meet/src/transports/types.ts index 6ba7da5bf4b..8a391568eec 100644 --- a/extensions/google-meet/src/transports/types.ts +++ b/extensions/google-meet/src/transports/types.ts @@ -43,6 +43,24 @@ export type GoogleMeetChromeHealth = { speaker?: string; text: string; }>; + realtimeTranscriptLines?: number; + lastRealtimeTranscriptAt?: string; + lastRealtimeTranscriptRole?: "user" | "assistant"; + lastRealtimeTranscriptText?: string; + recentRealtimeTranscript?: Array<{ + at: string; + role: "user" | "assistant"; + text: string; + }>; + lastRealtimeEventAt?: string; + lastRealtimeEventType?: string; + lastRealtimeEventDetail?: string; + recentRealtimeEvents?: Array<{ + at: string; + direction: "client" | "server"; + type: string; + detail?: string; + }>; manualActionRequired?: boolean; manualActionReason?: GoogleMeetManualActionReason; manualActionMessage?: string; diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts index b5554b65b05..bae2b6fda80 100644 --- a/extensions/openai/realtime-voice-provider.test.ts +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -457,4 +457,101 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { audio_end_ms: 240, }); }); + + it("forwards current realtime output audio events", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const onAudio = vi.fn(); + const onTranscript = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + onAudio, + onClearAudio: vi.fn(), + onTranscript, + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + await connecting; + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + + const audio = Buffer.from("assistant audio"); + socket.emit( + "message", + Buffer.from( + JSON.stringify({ + type: "response.output_audio.delta", + item_id: "item_1", + delta: audio.toString("base64"), + }), + ), + ); + socket.emit( + "message", + Buffer.from( + JSON.stringify({ + type: "response.output_audio_transcript.done", + transcript: "hello from current realtime events", + }), + ), + ); + + expect(onAudio).toHaveBeenCalledWith(audio); + expect(onTranscript).toHaveBeenCalledWith( + "assistant", + "hello from current realtime events", + true, + ); + }); + + it("creates an explicit user item and audio response for manual speech", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const onEvent = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + onAudio: vi.fn(), + onClearAudio: vi.fn(), + onEvent, + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + await connecting; + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + + bridge.triggerGreeting?.("Say exactly: hello from explicit speech."); + + expect(parseSent(socket).slice(-2)).toEqual([ + { + type: "conversation.item.create", + item: { + type: "message", + role: "user", + content: [ + { + type: "input_text", + text: "Say exactly: hello from explicit speech.", + }, + ], + }, + }, + { + type: "response.create", + response: { + output_modalities: ["audio", "text"], + }, + }, + ]); + expect(onEvent).toHaveBeenCalledWith({ direction: "client", type: "conversation.item.create" }); + expect(onEvent).toHaveBeenCalledWith({ direction: "client", type: "response.create" }); + }); }); diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index c0556eb9594..6708897c371 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -85,6 +85,7 @@ type RealtimeEvent = { response?: { id?: string; status?: string; + status_details?: unknown; }; error?: unknown; }; @@ -265,19 +266,19 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { content: [{ type: "input_text", text }], }, }); - this.sendEvent({ type: "response.create" }); + this.sendEvent({ + type: "response.create", + response: { + output_modalities: ["audio", "text"], + }, + }); } triggerGreeting(instructions?: string): void { if (!this.isConnected() || !this.ws) { return; } - this.sendEvent({ - type: "response.create", - response: { - instructions: instructions ?? this.config.instructions, - }, - }); + this.sendUserMessage(instructions ?? this.config.instructions ?? "Greet the meeting."); } submitToolResult(callId: string, result: unknown): void { @@ -545,6 +546,11 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } private handleEvent(event: RealtimeEvent): void { + this.config.onEvent?.({ + direction: "server", + type: event.type, + detail: this.describeServerEvent(event), + }); switch (event.type) { case "session.created": return; @@ -564,7 +570,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { this.responseActive = true; return; - case "response.audio.delta": { + case "response.audio.delta": + case "response.output_audio.delta": { if (!event.delta) { return; } @@ -586,12 +593,14 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { return; case "response.audio_transcript.delta": + case "response.output_audio_transcript.delta": if (event.delta) { this.config.onTranscript?.("assistant", event.delta, false); } return; case "response.audio_transcript.done": + case "response.output_audio_transcript.done": if (event.transcript) { this.config.onTranscript?.("assistant", event.transcript, true); } @@ -698,6 +707,11 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { private sendEvent(event: unknown): void { if (this.ws?.readyState === WebSocket.OPEN) { + const type = + event && typeof event === "object" && typeof (event as { type?: unknown }).type === "string" + ? (event as { type: string }).type + : "unknown"; + this.config.onEvent?.({ direction: "client", type }); const payload = JSON.stringify(event); captureWsEvent({ url: this.resolveConnectionParams().url, @@ -713,6 +727,23 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { this.ws.send(payload); } } + + private describeServerEvent(event: RealtimeEvent): string | undefined { + if (event.type === "error") { + return readRealtimeErrorDetail(event.error); + } + if (event.type === "response.done") { + const status = event.response?.status; + const details = + event.response?.status_details === undefined + ? undefined + : JSON.stringify(event.response.status_details); + return ( + [status ? `status=${status}` : undefined, details].filter(Boolean).join(" ") || undefined + ); + } + return undefined; + } } function readStringField(value: unknown, key: string): string | undefined { diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts index ceb63d0283c..d88b6d34b7b 100644 --- a/src/plugin-sdk/realtime-voice.ts +++ b/src/plugin-sdk/realtime-voice.ts @@ -4,6 +4,7 @@ export type { RealtimeVoiceBargeInOptions, RealtimeVoiceBridge, RealtimeVoiceBridgeCallbacks, + RealtimeVoiceBridgeEvent, RealtimeVoiceBrowserSession, RealtimeVoiceBrowserSessionCreateRequest, RealtimeVoiceBridgeCreateRequest, diff --git a/src/plugins/install.ts b/src/plugins/install.ts index 7896926487f..c5c6bb55a79 100644 --- a/src/plugins/install.ts +++ b/src/plugins/install.ts @@ -208,7 +208,7 @@ async function resolveTrustedOfficialPrereleaseResolution(params: { ); const stableVersion = semverVersions .filter((value) => !isPrereleaseSemverVersion(value)) - .sort(compareStableSemver) + .toSorted(compareStableSemver) .at(-1); if (!stableVersion) { if (semverVersions.length > 0 && semverVersions.every(isPrereleaseSemverVersion)) { diff --git a/src/realtime-voice/provider-types.ts b/src/realtime-voice/provider-types.ts index 51fd6a1b20c..5e2f399cf5a 100644 --- a/src/realtime-voice/provider-types.ts +++ b/src/realtime-voice/provider-types.ts @@ -52,11 +52,18 @@ export type RealtimeVoiceToolResultOptions = { willContinue?: boolean; }; +export type RealtimeVoiceBridgeEvent = { + direction: "client" | "server"; + type: string; + detail?: string; +}; + export type RealtimeVoiceBridgeCallbacks = { onAudio: (audio: Buffer) => void; onClearAudio: () => void; onMark?: (markName: string) => void; onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void; + onEvent?: (event: RealtimeVoiceBridgeEvent) => void; onToolCall?: (event: RealtimeVoiceToolCallEvent) => void; onReady?: () => void; onError?: (error: Error) => void; diff --git a/src/realtime-voice/session-runtime.ts b/src/realtime-voice/session-runtime.ts index 44535f5cd82..fb65a6cec20 100644 --- a/src/realtime-voice/session-runtime.ts +++ b/src/realtime-voice/session-runtime.ts @@ -4,6 +4,7 @@ import type { RealtimeVoiceAudioFormat, RealtimeVoiceBargeInOptions, RealtimeVoiceCloseReason, + RealtimeVoiceBridgeEvent, RealtimeVoiceProviderConfig, RealtimeVoiceRole, RealtimeVoiceTool, @@ -44,6 +45,7 @@ export type RealtimeVoiceBridgeSessionParams = { triggerGreetingOnReady?: boolean; tools?: RealtimeVoiceTool[]; onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void; + onEvent?: (event: RealtimeVoiceBridgeEvent) => void; onToolCall?: (event: RealtimeVoiceToolCallEvent, session: RealtimeVoiceBridgeSession) => void; onReady?: (session: RealtimeVoiceBridgeSession) => void; onError?: (error: Error) => void; @@ -104,6 +106,7 @@ export function createRealtimeVoiceBridgeSession( } }, onTranscript: params.onTranscript, + onEvent: params.onEvent, onToolCall: (event) => { if (!bridge) { return;