From 45cfe1dfa1a4a290fa5940924fbe8af75e1088cd Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 4 May 2026 02:48:45 +0100 Subject: [PATCH] feat(google-meet): default talk-back to agent mode --- CHANGELOG.md | 1 + docs/plugins/google-meet.md | 116 ++-- extensions/google-meet/index.test.ts | 574 ++++++++++++------ extensions/google-meet/index.ts | 25 +- extensions/google-meet/openclaw.plugin.json | 10 +- extensions/google-meet/src/cli.test.ts | 12 +- extensions/google-meet/src/cli.ts | 15 +- extensions/google-meet/src/config.ts | 14 +- extensions/google-meet/src/create.ts | 5 +- extensions/google-meet/src/node-host.ts | 7 +- extensions/google-meet/src/realtime-node.ts | 308 ++++++++++ extensions/google-meet/src/realtime.ts | 456 ++++++++++++++ extensions/google-meet/src/runtime.ts | 49 +- extensions/google-meet/src/setup.ts | 28 +- .../google-meet/src/transports/chrome.ts | 89 ++- 15 files changed, 1364 insertions(+), 345 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a95fa39f5ac..0aefe4a87c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,7 @@ Docs: https://docs.openclaw.ai - Realtime transcription: report socket closes before provider readiness as closed-before-ready failures instead of mislabeling them as connection timeouts for OpenAI, xAI, and Deepgram streaming transcription. Thanks @vincentkoc. - OpenAI/Google Meet: fail realtime voice connection attempts when the socket closes before `session.updated`, avoiding stuck Meet joins waiting on a bridge that never became ready. Thanks @vincentkoc. - Google Meet: avoid treating repeated participant words as multiple assistant-overlap matches when suppressing realtime echo transcripts. Thanks @vincentkoc. 
+- Google Meet: make `mode: "agent"` the default Chrome talk-back path, using realtime transcription for input and regular OpenClaw TTS for speech output, while keeping direct realtime voice answers available as `mode: "bidi"` and accepting `mode: "realtime"` as an agent-mode compatibility alias. - QA/cache: require the full `CACHE-OK ` marker before live cache probes stop retrying, so suffix-only prose cannot hide a broken probe response. Thanks @vincentkoc. - Slack/Matrix: avoid creating blank progress-draft messages when `streaming.progress.label=false` and progress tool lines are disabled. Thanks @vincentkoc. - Slack/Discord: suppress standalone tool-progress chatter when partial preview streaming has `streaming.preview.toolProgress: false`, matching the documented quiet-preview behavior. Thanks @vincentkoc. diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index d90ed14ce44..ebf6b6ddefc 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -116,16 +116,16 @@ Or let an agent join through the `google_meet` tool: "action": "join", "url": "https://meet.google.com/abc-defg-hij", "transport": "chrome-node", - "mode": "realtime" + "mode": "agent" } ``` The agent-facing `google_meet` tool stays available on non-macOS hosts for artifact, calendar, setup, transcribe, Twilio, and `chrome-node` flows. Local -Chrome realtime actions are blocked there because the bundled realtime Chrome -audio path currently depends on macOS `BlackHole 2ch`. On Linux, use -`mode: "transcribe"`, Twilio dial-in, or a macOS `chrome-node` host for realtime -Chrome participation. +Chrome talk-back actions are blocked there because the bundled Chrome audio path +currently depends on macOS `BlackHole 2ch`. On Linux, use `mode: "transcribe"`, +Twilio dial-in, or a macOS `chrome-node` host for Chrome talk-back +participation. 
Create a new meeting and join it: @@ -395,7 +395,7 @@ Common failure checks: ## Install notes -The Chrome realtime default uses two external tools: +The Chrome talk-back default uses two external tools: - `sox`: command-line audio utility. The plugin uses explicit CoreAudio device commands for the default 24 kHz PCM16 audio bridge. @@ -970,9 +970,10 @@ Workspace Developer Preview Program for Meet media APIs. ## Config -The common Chrome realtime path only needs the plugin enabled, BlackHole, SoX, -and a backend realtime voice provider key. OpenAI is the default; set -`realtime.provider: "google"` to use Google Gemini Live: +The common Chrome agent path only needs the plugin enabled, BlackHole, SoX, a +realtime transcription provider key, and a configured OpenClaw TTS provider. +OpenAI is the default transcription provider; set `realtime.provider: "google"` +to use Google Gemini Live for `bidi` mode: ```bash brew install blackhole-2ch sox @@ -999,7 +1000,8 @@ Set the plugin config under `plugins.entries.google-meet.config`: Defaults: - `defaultTransport: "chrome"` -- `defaultMode: "realtime"` +- `defaultMode: "agent"` (`"realtime"` is accepted as a compatibility alias for + `"agent"`) - `chromeNode.node`: optional node id/name/IP for `chrome-node` - `chrome.audioBackend: "blackhole-2ch"` - `chrome.guestName: "OpenClaw Agent"`: name used on the signed-out Meet guest @@ -1027,13 +1029,16 @@ Defaults: interruption on `chrome.bargeInInputCommand` - `chrome.bargeInCooldownMs: 900`: minimum delay between repeated human interruption clears -- `realtime.strategy: "agent"`: default. Participant speech is transcribed, - sent to the configured OpenClaw agent in a per-meeting sub-agent session, and - the returned answer is spoken back through the realtime provider. -- `realtime.strategy: "bidi"`: direct bidirectional realtime model mode. The - realtime provider answers participant speech directly and may call +- `mode: "agent"`: default talk-back mode. 
Participant speech is transcribed by + the configured realtime transcription provider, sent to the configured + OpenClaw agent in a per-meeting sub-agent session, and spoken back through the + normal OpenClaw TTS runtime. +- `mode: "bidi"`: fallback direct bidirectional realtime model mode. The + realtime voice provider answers participant speech directly and may call `openclaw_agent_consult` for deeper/tool-backed answers. -- `realtime.provider: "openai"` +- `mode: "transcribe"`: observe-only mode without the talk-back bridge. +- `realtime.provider: "openai"`: provider id used by `agent` mode for realtime + transcription and by `bidi` mode for realtime voice. - `realtime.toolPolicy: "safe-read-only"` - `realtime.instructions`: brief spoken replies, with `openclaw_agent_consult` for deeper answers @@ -1077,8 +1082,8 @@ Optional overrides: chromeNode: { node: "parallels-macos", }, + defaultMode: "agent", realtime: { - strategy: "agent", provider: "google", agentId: "jay", toolPolicy: "owner", @@ -1124,23 +1129,25 @@ Agents can use the `google_meet` tool: "action": "join", "url": "https://meet.google.com/abc-defg-hij", "transport": "chrome-node", - "mode": "realtime" + "mode": "agent" } ``` Use `transport: "chrome"` when Chrome runs on the Gateway host. Use `transport: "chrome-node"` when Chrome runs on a paired node such as a Parallels -VM. In both cases the realtime model and `openclaw_agent_consult` run on the -Gateway host, so model credentials stay there. With the default -`realtime.strategy: "agent"`, the realtime provider handles audio and -transcription while the configured OpenClaw agent produces the spoken answer. -With `realtime.strategy: "bidi"`, the realtime model answers directly. +VM. In both cases the model providers and `openclaw_agent_consult` run on the +Gateway host, so model credentials stay there. 
With the default `mode: "agent"`, +the realtime transcription provider handles listening, the configured OpenClaw +agent produces the answer, and regular OpenClaw TTS speaks it into Meet. Use +`mode: "bidi"` when you want the realtime voice model to answer directly. +`mode: "realtime"` remains accepted as a compatibility alias for +`mode: "agent"`. Use `action: "status"` to list active sessions or inspect a session ID. Use `action: "speak"` with `sessionId` and `message` to make the realtime agent speak immediately. Use `action: "test_speech"` to create or reuse the session, trigger a known phrase, and return `inCall` health when the Chrome host can -report it. `test_speech` always forces `mode: "realtime"` and fails if asked to +report it. `test_speech` always forces `mode: "agent"` and fails if asked to run in `mode: "transcribe"` because observe-only sessions intentionally cannot emit speech. Its `speechOutputVerified` result is based on realtime audio output bytes increasing during this test call, so a reused session with older audio @@ -1172,38 +1179,38 @@ a session ended. } ``` -## Realtime agent consult +## Agent And Bidi Modes -Chrome realtime mode is optimized for a live voice loop. The realtime voice -provider hears the meeting audio and speaks through the configured audio bridge. -The default `realtime.strategy: "agent"` uses the realtime provider for audio -I/O and transcription, but routes final participant transcripts through the -configured OpenClaw agent before speaking. Set `realtime.strategy: "bidi"` when -you want the realtime model to answer directly. +Chrome `agent` mode is optimized for "my agent is in the meeting" behavior. The +realtime transcription provider hears the meeting audio, final participant +transcripts are routed through the configured OpenClaw agent, and the answer is +spoken through the normal OpenClaw TTS runtime. Set `mode: "bidi"` when you want +the realtime voice model to answer directly. 
Nearby final transcript fragments are coalesced before the consult so one spoken -turn does not produce several stale partial answers. -Realtime input is also suppressed while queued assistant audio is still playing, +turn does not produce several stale partial answers. Realtime input is also +suppressed while queued assistant audio is still playing, and recent assistant-like transcript echoes are ignored before the agent consult so BlackHole loopback does not make the agent answer its own speech. -| Strategy | Who decides the answer | Context behavior | Use when | -| -------- | ----------------------------- | ------------------------------------------------------------------------------------ | ----------------------------------------------------- | -| `agent` | The configured OpenClaw agent | Per-meeting sub-agent session plus normal agent policy, tools, workspace, and memory | You want "my agent is in the meeting" behavior | -| `bidi` | The realtime voice model | Realtime session context, with optional `openclaw_agent_consult` calls | You want the lowest-latency conversational voice loop | +| Mode | Who decides the answer | Speech output path | Use when | +| ------- | ----------------------------- | -------------------------------------- | ----------------------------------------------------- | +| `agent` | The configured OpenClaw agent | Normal OpenClaw TTS runtime | You want "my agent is in the meeting" behavior | +| `bidi` | The realtime voice model | Realtime voice provider audio response | You want the lowest-latency conversational voice loop | -In `bidi` strategy, when the realtime model needs deeper reasoning, current +In `bidi` mode, when the realtime model needs deeper reasoning, current information, or normal OpenClaw tools, it can call `openclaw_agent_consult`. The consult tool runs the regular OpenClaw agent behind the scenes with recent -meeting transcript context and returns a concise spoken answer to the realtime -voice session. 
The voice model can then speak that answer back into the meeting. -It uses the same shared realtime consult tool as Voice Call. +meeting transcript context and returns a concise spoken answer. In `agent` mode, +OpenClaw sends that answer directly to the TTS runtime; in `bidi` mode, the +realtime voice model can speak the consult result back into the meeting. It uses +the same shared consult machinery as Voice Call. By default, consults run against the `main` agent. Set `realtime.agentId` when a Meet lane should consult a dedicated OpenClaw agent workspace, model defaults, tool policy, memory, and session history. -Agent strategy consults use a per-meeting `agent::subagent:google-meet:` +Agent-mode consults use a per-meeting `agent::subagent:google-meet:` session key so follow-up questions keep meeting context while inheriting normal agent policy from the configured agent. @@ -1307,10 +1314,10 @@ The running agent only sees plugin tools registered by the current Gateway process. On non-macOS Gateway hosts, the agent-facing `google_meet` tool stays visible, -but local Chrome realtime actions are blocked before they hit the audio bridge. -Local Chrome realtime audio currently depends on macOS `BlackHole 2ch`, so +but local Chrome talk-back actions are blocked before they hit the audio bridge. +Local Chrome talk-back audio currently depends on macOS `BlackHole 2ch`, so Linux agents should use `mode: "transcribe"`, Twilio dial-in, or a macOS -`chrome-node` host instead of the default local Chrome realtime path. +`chrome-node` host instead of the default local Chrome agent path. ### No connected Google Meet-capable node @@ -1424,8 +1431,9 @@ openclaw googlemeet setup openclaw googlemeet doctor ``` -Use `mode: "realtime"` for listen/talk-back. `mode: "transcribe"` intentionally -does not start the duplex realtime voice bridge. 
For observe-only debugging, +Use `mode: "agent"` for the normal STT -> OpenClaw agent -> TTS talk-back path, +or `mode: "bidi"` for the direct realtime voice fallback. `mode: "transcribe"` +intentionally does not start the talk-back bridge. For observe-only debugging, run `openclaw googlemeet status --json ` after participants speak and check `captioning`, `transcriptLines`, and `lastCaptionText`. If `inCall` is true but `transcriptLines` stays at `0`, Meet captions may be disabled, no one @@ -1607,14 +1615,16 @@ call still needs a participant path. This plugin keeps that boundary visible: Chrome handles browser participation and local audio routing; Twilio handles phone dial-in participation. -Chrome realtime mode needs `BlackHole 2ch` plus either: +Chrome talk-back modes need `BlackHole 2ch` plus either: - `chrome.audioInputCommand` plus `chrome.audioOutputCommand`: OpenClaw owns the - realtime voice bridge and pipes audio in `chrome.audioFormat` between those - commands and the selected realtime voice provider. The default Chrome path is - 24 kHz PCM16; 8 kHz G.711 mu-law remains available for legacy command pairs. + bridge and pipes audio in `chrome.audioFormat` between those commands and the + selected provider. Agent mode uses realtime transcription plus regular TTS; + bidi mode uses the realtime voice provider. The default Chrome path is 24 kHz + PCM16; 8 kHz G.711 mu-law remains available for legacy command pairs. - `chrome.audioBridgeCommand`: an external bridge command owns the whole local - audio path and must exit after starting or validating its daemon. + audio path and must exit after starting or validating its daemon. This is only + valid for `bidi` because `agent` mode needs direct command-pair access for TTS. For clean duplex audio, route Meet output and Meet microphone through separate virtual devices or a Loopback-style virtual device graph. 
A single shared @@ -1628,7 +1638,7 @@ Like `chrome.audioInputCommand` and `chrome.audioOutputCommand`, it is an operator-configured local command. Use an explicit trusted command path or argument list, and do not point it at scripts from untrusted locations. -`googlemeet speak` triggers the active realtime audio bridge for a Chrome +`googlemeet speak` triggers the active talk-back audio bridge for a Chrome session. `googlemeet leave` stops that bridge. For Twilio sessions delegated through the Voice Call plugin, `leave` also hangs up the underlying voice call. Use `googlemeet end-active-conference` when you also want to close the active diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index 9508d37a0bc..1600fb49c53 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -4,6 +4,7 @@ import { tmpdir } from "node:os"; import path from "node:path"; import { PassThrough, Writable } from "node:stream"; import { createContext, Script } from "node:vm"; +import type { RealtimeTranscriptionProviderPlugin } from "openclaw/plugin-sdk/realtime-transcription"; import type { RealtimeVoiceProviderPlugin } from "openclaw/plugin-sdk/realtime-voice"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import plugin, { __testing as googleMeetPluginTesting } from "./index.js"; @@ -25,8 +26,10 @@ import { import { handleGoogleMeetNodeHostCommand } from "./src/node-host.js"; import { startNodeRealtimeAudioBridge } from "./src/realtime-node.js"; import { + convertGoogleMeetTtsAudioForBridge, extendGoogleMeetOutputEchoSuppression, isGoogleMeetLikelyAssistantEchoTranscript, + startCommandAgentAudioBridge, startCommandRealtimeAudioBridge, } from "./src/realtime.js"; import { GoogleMeetRuntime, normalizeMeetUrl } from "./src/runtime.js"; @@ -94,19 +97,6 @@ function setup( return harness; } -async function withProcessPlatform( - platform: NodeJS.Platform, - callback: () => Promise, -): 
Promise { - const originalPlatform = process.platform; - Object.defineProperty(process, "platform", { value: platform }); - try { - return await callback(); - } finally { - Object.defineProperty(process, "platform", { value: originalPlatform }); - } -} - function jsonResponse(value: unknown): Response { return new Response(JSON.stringify(value), { status: 200, @@ -324,13 +314,13 @@ describe("google-meet plugin", () => { googleMeetPluginTesting.setPlatformForTests(); }); - it("defaults to chrome realtime with safe read-only tools", () => { + it("defaults to chrome agent mode with safe read-only tools", () => { expect(resolveGoogleMeetConfig({})).toMatchObject({ enabled: true, defaults: {}, preview: { enrollmentAcknowledged: false }, defaultTransport: "chrome", - defaultMode: "realtime", + defaultMode: "agent", chrome: { audioBackend: "blackhole-2ch", launch: true, @@ -537,7 +527,7 @@ describe("google-meet plugin", () => { ); }); - it("keeps the agent tool visible on non-macOS hosts but blocks local Chrome realtime joins", async () => { + it("keeps the agent tool visible on non-macOS hosts but blocks local Chrome talk-back joins", async () => { const { cliRegistrations, methods, tools } = setup(undefined, { registerPlatform: "linux" }); const tool = tools[0] as { execute: (id: string, params: unknown) => Promise<{ isError?: boolean; content: unknown }>; @@ -555,7 +545,7 @@ describe("google-meet plugin", () => { ).toBe(true); const blocked = await tool.execute("id", { action: "join" }); - expect(JSON.stringify(blocked)).toContain("local Chrome realtime audio is macOS-only"); + expect(JSON.stringify(blocked)).toContain("local Chrome talk-back audio is macOS-only"); expect( googleMeetPluginTesting.isGoogleMeetAgentToolActionUnsupportedOnHost({ @@ -631,7 +621,7 @@ describe("google-meet plugin", () => { description: expect.stringContaining("recover_current_tab"), }, transport: { type: "string", enum: ["chrome", "chrome-node", "twilio"] }, - mode: { type: "string", enum: 
["realtime", "transcribe"] }, + mode: { type: "string", enum: ["agent", "bidi", "realtime", "transcribe"] }, }, }); }); @@ -1077,7 +1067,7 @@ describe("google-meet plugin", () => { expect(result.details.session).toMatchObject({ transport: "twilio", - mode: "realtime", + mode: "agent", twilio: { dialInNumber: "+15551234567", pinProvided: true, @@ -1179,6 +1169,53 @@ describe("google-meet plugin", () => { } }); + it("rejects agent-mode external audio bridges in setup status", async () => { + const originalPlatform = process.platform; + Object.defineProperty(process, "platform", { value: "darwin" }); + try { + const { tools } = setup( + { + defaultMode: "agent", + defaultTransport: "chrome", + chrome: { + audioBridgeCommand: ["bridge", "start"], + audioInputCommand: ["capture-meet"], + audioOutputCommand: ["play-meet"], + }, + }, + { + runCommandWithTimeoutHandler: async (argv) => { + if (argv[0] === "/usr/sbin/system_profiler") { + return { code: 0, stdout: "BlackHole 2ch", stderr: "" }; + } + return { code: 0, stdout: "", stderr: "" }; + }, + }, + ); + const tool = tools[0] as { + execute: ( + id: string, + params: unknown, + ) => Promise<{ details: { ok?: boolean; checks?: unknown[] } }>; + }; + + const result = await tool.execute("id", { action: "setup_status" }); + + expect(result.details.ok).toBe(false); + expect(result.details.checks).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + id: "audio-bridge", + ok: false, + message: expect.stringContaining("chrome.audioBridgeCommand is bidi-only"), + }), + ]), + ); + } finally { + Object.defineProperty(process, "platform", { value: originalPlatform }); + } + }); + it("reports attendance through the tool", async () => { stubMeetArtifactsApi(); const { tools } = setup(); @@ -1894,209 +1931,223 @@ describe("google-meet plugin", () => { }); it("grants local Chrome Meet media permissions against the opened tab", async () => { - const callGatewayFromCli = mockLocalMeetBrowserRequest({ - inCall: true, - 
micMuted: false, - title: "Meet call", - url: "https://meet.google.com/abc-defg-hij", - }); - const { methods } = setup({ - defaultMode: "realtime", - defaultTransport: "chrome", - chrome: { - audioBridgeCommand: ["bridge", "start"], - }, - realtime: { introMessage: "" }, - }); - const handler = methods.get("googlemeet.join") as - | ((ctx: { - params: Record; - respond: ReturnType; - }) => Promise) - | undefined; - const respond = vi.fn(); + const originalPlatform = process.platform; + Object.defineProperty(process, "platform", { value: "darwin" }); + try { + const callGatewayFromCli = mockLocalMeetBrowserRequest({ + inCall: true, + micMuted: false, + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + }); + const { methods } = setup({ + defaultMode: "bidi", + defaultTransport: "chrome", + chrome: { + audioBridgeCommand: ["bridge", "start"], + }, + realtime: { introMessage: "" }, + }); + const handler = methods.get("googlemeet.join") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const respond = vi.fn(); - await withProcessPlatform("darwin", async () => { await handler?.({ params: { url: "https://meet.google.com/abc-defg-hij" }, respond, }); - }); - expect(respond.mock.calls[0]?.[0]).toBe(true); - expect(callGatewayFromCli).toHaveBeenCalledWith( - "browser.request", - expect.any(Object), - expect.objectContaining({ - method: "POST", - path: "/permissions/grant", - body: expect.objectContaining({ - origin: "https://meet.google.com", - permissions: ["audioCapture", "videoCapture"], - targetId: "local-meet-tab", + expect(respond.mock.calls[0]?.[0]).toBe(true); + expect(callGatewayFromCli).toHaveBeenCalledWith( + "browser.request", + expect.any(Object), + expect.objectContaining({ + method: "POST", + path: "/permissions/grant", + body: expect.objectContaining({ + origin: "https://meet.google.com", + permissions: ["audioCapture", "videoCapture"], + targetId: "local-meet-tab", + }), }), - }), - { progress: 
false }, - ); + { progress: false }, + ); + } finally { + Object.defineProperty(process, "platform", { value: originalPlatform }); + } }); it("starts the local realtime audio bridge after Meet is inspected", async () => { + const originalPlatform = process.platform; + Object.defineProperty(process, "platform", { value: "darwin" }); const events: string[] = []; - const callGatewayFromCli = vi.fn( - async ( - _method: string, - _opts: unknown, - params?: unknown, - _extra?: unknown, - ): Promise> => { - const request = params as { - path?: string; - body?: { fn?: string; targetId?: string; url?: string }; - }; - events.push(`browser:${request.path}`); - if (request.path === "/tabs") { - return { tabs: [] }; - } - if (request.path === "/tabs/open") { - return { - targetId: "local-meet-tab", - title: "Meet", - url: request.body?.url ?? "https://meet.google.com/abc-defg-hij", + try { + const callGatewayFromCli = vi.fn( + async ( + _method: string, + _opts: unknown, + params?: unknown, + _extra?: unknown, + ): Promise> => { + const request = params as { + path?: string; + body?: { fn?: string; targetId?: string; url?: string }; }; - } - if (request.path === "/tabs/focus" || request.path === "/permissions/grant") { - return { ok: true }; - } - if (request.path === "/act") { - return { - result: JSON.stringify({ - inCall: true, - micMuted: false, - title: "Meet call", - url: "https://meet.google.com/abc-defg-hij", - }), - }; - } - throw new Error(`unexpected browser request path ${request.path}`); - }, - ); - chromeTransportTesting.setDepsForTest({ callGatewayFromCli }); - const { methods } = setup( - { - defaultMode: "realtime", - defaultTransport: "chrome", - chrome: { - audioBridgeCommand: ["bridge", "start"], + events.push(`browser:${request.path}`); + if (request.path === "/tabs") { + return { tabs: [] }; + } + if (request.path === "/tabs/open") { + return { + targetId: "local-meet-tab", + title: "Meet", + url: request.body?.url ?? 
"https://meet.google.com/abc-defg-hij", + }; + } + if (request.path === "/tabs/focus" || request.path === "/permissions/grant") { + return { ok: true }; + } + if (request.path === "/act") { + return { + result: JSON.stringify({ + inCall: true, + micMuted: false, + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + }), + }; + } + throw new Error(`unexpected browser request path ${request.path}`); }, - realtime: { introMessage: "" }, - }, - { - runCommandWithTimeoutHandler: async (argv) => { - events.push(`command:${argv.join(" ")}`); - return argv[0] === "/usr/sbin/system_profiler" - ? { code: 0, stdout: "BlackHole 2ch", stderr: "" } - : { code: 0, stdout: "", stderr: "" }; + ); + chromeTransportTesting.setDepsForTest({ callGatewayFromCli }); + const { methods } = setup( + { + defaultMode: "bidi", + defaultTransport: "chrome", + chrome: { + audioBridgeCommand: ["bridge", "start"], + }, + realtime: { introMessage: "" }, }, - }, - ); - const handler = methods.get("googlemeet.join") as - | ((ctx: { - params: Record; - respond: ReturnType; - }) => Promise) - | undefined; - const respond = vi.fn(); + { + runCommandWithTimeoutHandler: async (argv) => { + events.push(`command:${argv.join(" ")}`); + return argv[0] === "/usr/sbin/system_profiler" + ? 
{ code: 0, stdout: "BlackHole 2ch", stderr: "" } + : { code: 0, stdout: "", stderr: "" }; + }, + }, + ); + const handler = methods.get("googlemeet.join") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const respond = vi.fn(); - await withProcessPlatform("darwin", async () => { await handler?.({ params: { url: "https://meet.google.com/abc-defg-hij" }, respond, }); - }); - expect(respond.mock.calls[0]?.[0]).toBe(true); - expect(events.indexOf("browser:/act")).toBeGreaterThan(-1); - expect(events.indexOf("command:bridge start")).toBeGreaterThan(events.indexOf("browser:/act")); + expect(respond.mock.calls[0]?.[0]).toBe(true); + expect(events.indexOf("browser:/act")).toBeGreaterThan(-1); + expect(events.indexOf("command:bridge start")).toBeGreaterThan( + events.indexOf("browser:/act"), + ); + } finally { + Object.defineProperty(process, "platform", { value: originalPlatform }); + } }); it("does not start the local realtime audio bridge while Meet admission is pending", async () => { + const originalPlatform = process.platform; + Object.defineProperty(process, "platform", { value: "darwin" }); const events: string[] = []; - const callGatewayFromCli = vi.fn( - async ( - _method: string, - _opts: unknown, - params?: unknown, - _extra?: unknown, - ): Promise> => { - const request = params as { path?: string; body?: { targetId?: string; url?: string } }; - events.push(`browser:${request.path}`); - if (request.path === "/tabs") { - return { tabs: [] }; - } - if (request.path === "/tabs/open") { - return { - targetId: "local-meet-tab", - title: "Meet", - url: request.body?.url ?? 
"https://meet.google.com/abc-defg-hij", - }; - } - if (request.path === "/tabs/focus" || request.path === "/permissions/grant") { - return { ok: true }; - } - if (request.path === "/act") { - return { - result: JSON.stringify({ - inCall: false, - lobbyWaiting: true, - manualActionRequired: true, - manualActionReason: "meet-admission-required", - manualActionMessage: "Admit the OpenClaw browser participant in Google Meet.", + try { + const callGatewayFromCli = vi.fn( + async ( + _method: string, + _opts: unknown, + params?: unknown, + _extra?: unknown, + ): Promise> => { + const request = params as { path?: string; body?: { targetId?: string; url?: string } }; + events.push(`browser:${request.path}`); + if (request.path === "/tabs") { + return { tabs: [] }; + } + if (request.path === "/tabs/open") { + return { + targetId: "local-meet-tab", title: "Meet", - url: "https://meet.google.com/abc-defg-hij", - }), - }; - } - throw new Error(`unexpected browser request path ${request.path}`); - }, - ); - chromeTransportTesting.setDepsForTest({ callGatewayFromCli }); - const { methods } = setup( - { - defaultMode: "realtime", - defaultTransport: "chrome", - chrome: { - audioBridgeCommand: ["bridge", "start"], - waitForInCallMs: 1, + url: request.body?.url ?? 
"https://meet.google.com/abc-defg-hij", + }; + } + if (request.path === "/tabs/focus" || request.path === "/permissions/grant") { + return { ok: true }; + } + if (request.path === "/act") { + return { + result: JSON.stringify({ + inCall: false, + lobbyWaiting: true, + manualActionRequired: true, + manualActionReason: "meet-admission-required", + manualActionMessage: "Admit the OpenClaw browser participant in Google Meet.", + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }), + }; + } + throw new Error(`unexpected browser request path ${request.path}`); }, - realtime: { introMessage: "" }, - }, - { - runCommandWithTimeoutHandler: async (argv) => { - events.push(`command:${argv.join(" ")}`); - return argv[0] === "/usr/sbin/system_profiler" - ? { code: 0, stdout: "BlackHole 2ch", stderr: "" } - : { code: 0, stdout: "", stderr: "" }; + ); + chromeTransportTesting.setDepsForTest({ callGatewayFromCli }); + const { methods } = setup( + { + defaultMode: "bidi", + defaultTransport: "chrome", + chrome: { + audioBridgeCommand: ["bridge", "start"], + waitForInCallMs: 1, + }, + realtime: { introMessage: "" }, }, - }, - ); - const handler = methods.get("googlemeet.join") as - | ((ctx: { - params: Record; - respond: ReturnType; - }) => Promise) - | undefined; - const respond = vi.fn(); + { + runCommandWithTimeoutHandler: async (argv) => { + events.push(`command:${argv.join(" ")}`); + return argv[0] === "/usr/sbin/system_profiler" + ? 
{ code: 0, stdout: "BlackHole 2ch", stderr: "" } + : { code: 0, stdout: "", stderr: "" }; + }, + }, + ); + const handler = methods.get("googlemeet.join") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const respond = vi.fn(); - await withProcessPlatform("darwin", async () => { await handler?.({ params: { url: "https://meet.google.com/abc-defg-hij" }, respond, }); - }); - expect(respond.mock.calls[0]?.[0]).toBe(true); - expect(events).toContain("browser:/act"); - expect(events).not.toContain("command:bridge start"); + expect(respond.mock.calls[0]?.[0]).toBe(true); + expect(events).toContain("browser:/act"); + expect(events).not.toContain("command:bridge start"); + } finally { + Object.defineProperty(process, "platform", { value: originalPlatform }); + } }); it("refreshes observe-only caption health when status is requested", async () => { @@ -2220,7 +2271,7 @@ describe("google-meet plugin", () => { let openedTab = false; const { methods, nodesInvoke } = setup( { - defaultMode: "realtime", + defaultMode: "agent", defaultTransport: "chrome-node", }, { @@ -2462,7 +2513,7 @@ describe("google-meet plugin", () => { expect(result.micMuted).toBe(true); expect(localMic.click).toHaveBeenCalledTimes(1); expect(remoteMute.click).not.toHaveBeenCalled(); - expect(result.notes).toContain("Attempted to turn on the Meet microphone for realtime mode."); + expect(result.notes).toContain("Attempted to turn on the Meet microphone for talk-back mode."); }); it("blocks realtime speech while the Meet microphone remains muted", async () => { @@ -3098,7 +3149,7 @@ describe("google-meet plugin", () => { id: "meet_1", url: "https://meet.google.com/abc-defg-hij", transport: "chrome", - mode: "realtime", + mode: "agent", state: "active", createdAt: "2026-04-27T00:00:00.000Z", updatedAt: "2026-04-27T00:00:00.000Z", @@ -3123,7 +3174,7 @@ describe("google-meet plugin", () => { expect(join).toHaveBeenCalledWith( expect.objectContaining({ message: "Say 
exactly: hello.", - mode: "realtime", + mode: "agent", }), ); expect(speak).not.toHaveBeenCalled(); @@ -3145,7 +3196,7 @@ describe("google-meet plugin", () => { url: "https://meet.google.com/abc-defg-hij", mode: "transcribe", }), - ).rejects.toThrow("test_speech requires mode: realtime"); + ).rejects.toThrow("test_speech requires mode: agent or bidi"); }); it("rejects realtime and Twilio modes for test listen", async () => { @@ -3159,7 +3210,7 @@ describe("google-meet plugin", () => { await expect( runtime.testListen({ url: "https://meet.google.com/abc-defg-hij", - mode: "realtime", + mode: "agent", }), ).rejects.toThrow("test_listen requires mode: transcribe"); @@ -3240,7 +3291,7 @@ describe("google-meet plugin", () => { const { methods, nodesInvoke } = setup( { defaultTransport: "chrome-node", - defaultMode: "realtime", + defaultMode: "agent", }, { nodesInvokeHandler: async ({ command, params }) => { @@ -3437,6 +3488,7 @@ describe("google-meet plugin", () => { Object.defineProperty(process, "platform", { value: "darwin" }); try { const { methods, runCommandWithTimeout } = setup({ + defaultMode: "bidi", chrome: { audioBridgeHealthCommand: ["bridge", "status"], audioBridgeCommand: ["bridge", "start"], @@ -3478,6 +3530,136 @@ describe("google-meet plugin", () => { } }); + it("uses realtime transcription plus regular TTS in Chrome agent mode", async () => { + let callbacks: Parameters[0] | undefined; + const sendAudio = vi.fn(); + const sttSession = { + connect: vi.fn(async () => {}), + sendAudio, + close: vi.fn(), + isConnected: vi.fn(() => true), + }; + const provider: RealtimeTranscriptionProviderPlugin = { + id: "openai", + label: "OpenAI", + autoSelectOrder: 1, + resolveConfig: ({ rawConfig }) => rawConfig, + isConfigured: () => true, + createSession: (req) => { + callbacks = req; + return sttSession; + }, + }; + const inputStdout = new PassThrough(); + const outputStdinWrites: Buffer[] = []; + const makeProcess = (stdio: { + stdin?: { write(chunk: unknown): 
unknown } | null; + stdout?: { on(event: "data", listener: (chunk: unknown) => void): unknown } | null; + }): TestBridgeProcess => { + const proc = new EventEmitter() as unknown as TestBridgeProcess; + proc.stdin = stdio.stdin; + proc.stdout = stdio.stdout; + proc.stderr = new PassThrough(); + proc.killed = false; + proc.kill = vi.fn(() => { + proc.killed = true; + return true; + }); + return proc; + }; + const outputStdin = new Writable({ + write(chunk, _encoding, done) { + outputStdinWrites.push(Buffer.from(chunk)); + done(); + }, + }); + const inputProcess = makeProcess({ stdout: inputStdout, stdin: null }); + const outputProcess = makeProcess({ stdin: outputStdin, stdout: null }); + const spawnMock = vi.fn().mockReturnValueOnce(outputProcess).mockReturnValueOnce(inputProcess); + const sessionStore: Record = {}; + const runtime = { + tts: { + textToSpeechTelephony: vi.fn(async () => ({ + success: true, + audioBuffer: Buffer.from([1, 0, 2, 0]), + sampleRate: 24_000, + })), + }, + agent: { + resolveAgentDir: vi.fn(() => "/tmp/agent"), + resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"), + ensureAgentWorkspace: vi.fn(async () => {}), + session: { + resolveStorePath: vi.fn(() => "/tmp/sessions.json"), + loadSessionStore: vi.fn(() => sessionStore), + saveSessionStore: vi.fn(async () => {}), + updateSessionStore: vi.fn(async (_storePath, mutator) => mutator(sessionStore as never)), + resolveSessionFilePath: vi.fn(() => "/tmp/session.json"), + }, + runEmbeddedPiAgent: vi.fn(async () => ({ + payloads: [{ text: "Use the Portugal launch data." 
}], + meta: {}, + })), + resolveAgentTimeoutMs: vi.fn(() => 1000), + }, + }; + + const handle = await startCommandAgentAudioBridge({ + config: resolveGoogleMeetConfig({ + realtime: { provider: "openai", agentId: "jay", introMessage: "" }, + }), + fullConfig: {} as never, + runtime: runtime as never, + meetingSessionId: "meet-1", + inputCommand: ["capture-meet"], + outputCommand: ["play-meet"], + logger: noopLogger, + providers: [provider], + spawn: spawnMock, + }); + + inputStdout.write(Buffer.from([1, 0, 2, 0, 3, 0, 4, 0])); + callbacks?.onTranscript?.("Please summarize the launch."); + await new Promise((resolve) => setTimeout(resolve, 1100)); + + expect(sendAudio).toHaveBeenCalledWith(expect.any(Buffer)); + expect(runtime.agent.runEmbeddedPiAgent).toHaveBeenCalled(); + expect(runtime.tts.textToSpeechTelephony).toHaveBeenCalledWith({ + text: "Use the Portugal launch data.", + cfg: {}, + }); + expect(Buffer.concat(outputStdinWrites)).toEqual(Buffer.from([1, 0, 2, 0])); + expect(handle.getHealth()).toMatchObject({ + providerConnected: true, + audioInputActive: true, + audioOutputActive: true, + realtimeTranscriptLines: 2, + lastRealtimeTranscriptRole: "assistant", + }); + await handle.stop(); + }); + + it("preserves telephony TTS output formats when routing Google Meet agent audio", () => { + const ulaw = Buffer.from([0xff, 0x7f, 0x00]); + const pcmBridgeConfig = resolveGoogleMeetConfig({ chrome: { audioFormat: "pcm16-24khz" } }); + const ulawBridgeConfig = resolveGoogleMeetConfig({ chrome: { audioFormat: "g711-ulaw-8khz" } }); + + expect( + convertGoogleMeetTtsAudioForBridge(ulaw, 8_000, ulawBridgeConfig, "raw-8khz-8bit-mono-mulaw"), + ).toEqual(ulaw); + const pcmForMeet = convertGoogleMeetTtsAudioForBridge( + ulaw, + 8_000, + pcmBridgeConfig, + "ulaw_8000", + ); + expect(pcmForMeet.byteLength).toBe(18); + expect(pcmForMeet).not.toEqual(ulaw); + expect(() => + convertGoogleMeetTtsAudioForBridge(Buffer.from([1, 2, 3]), 8_000, pcmBridgeConfig, "mp3"), + 
).toThrow("Unsupported telephony TTS output format"); + }); + it("pipes Chrome command-pair audio through the realtime provider", async () => { let callbacks: Parameters[0] | undefined; const sendAudio = vi.fn(); diff --git a/extensions/google-meet/index.ts b/extensions/google-meet/index.ts index 1b1c9a8b9a5..c9c009372cf 100644 --- a/extensions/google-meet/index.ts +++ b/extensions/google-meet/index.ts @@ -52,7 +52,7 @@ const googleMeetConfigSchema = { }, defaultMode: { label: "Default Mode", - help: "Realtime starts the duplex voice model loop. Transcribe joins/observes without the realtime talk-back bridge.", + help: "Agent uses realtime transcription plus regular OpenClaw TTS. Bidi uses the realtime voice model directly. Transcribe observes only.", }, "chrome.audioBackend": { label: "Chrome Audio Backend", @@ -152,7 +152,7 @@ const googleMeetConfigSchema = { "voiceCall.introMessage": { label: "Voice Call Intro Message", advanced: true }, "realtime.strategy": { label: "Realtime Strategy", - help: "Agent routes participant speech through OpenClaw before speaking; bidi lets the realtime model answer directly.", + help: "Legacy realtime alias setting. Use mode=agent or mode=bidi for new Meet joins.", }, "realtime.provider": { label: "Realtime Provider", @@ -238,9 +238,9 @@ const GoogleMeetToolSchema = Type.Object({ ), mode: Type.Optional( Type.String({ - enum: ["realtime", "transcribe"], + enum: ["agent", "bidi", "realtime", "transcribe"], description: - "Join mode. realtime starts live listen/talk-back through the realtime voice model; transcribe joins without the realtime talk-back bridge.", + "Join mode. agent uses realtime transcription, the configured OpenClaw agent, and regular TTS. bidi uses the realtime voice model directly. realtime is a compatibility alias for agent. 
transcribe joins observe-only.", }), ), dialInNumber: Type.Optional( @@ -328,7 +328,14 @@ function normalizeTransport(value: unknown): GoogleMeetTransport | undefined { } function normalizeMode(value: unknown): GoogleMeetMode | undefined { - return value === "realtime" || value === "transcribe" ? value : undefined; + if (value === "realtime") { + return "agent"; + } + return value === "agent" || value === "bidi" || value === "transcribe" ? value : undefined; +} + +function isGoogleMeetTalkBackMode(mode: GoogleMeetMode): boolean { + return mode === "agent" || mode === "bidi"; } function resolveMeetingInput(config: GoogleMeetConfig, value: unknown): string { @@ -418,9 +425,9 @@ function isGoogleMeetAgentToolActionUnsupportedOnHost(params: { const transport = normalizeTransport(params.raw.transport) ?? params.config.defaultTransport; const mode = action === "test_speech" - ? "realtime" + ? "agent" : (normalizeMode(params.raw.mode) ?? params.config.defaultMode); - return transport === "chrome" && mode === "realtime"; + return transport === "chrome" && isGoogleMeetTalkBackMode(mode); } function assertGoogleMeetAgentToolActionSupported(params: { @@ -431,7 +438,7 @@ function assertGoogleMeetAgentToolActionSupported(params: { return; } throw new Error( - "Google Meet local Chrome realtime audio is macOS-only. On this host, use mode: transcribe, transport: twilio, or transport: chrome-node backed by a macOS node.", + "Google Meet local Chrome talk-back audio is macOS-only. On this host, use mode: transcribe, transport: twilio, or transport: chrome-node backed by a macOS node.", ); } @@ -998,7 +1005,7 @@ export default definePluginEntry({ name: "google_meet", label: "Google Meet", description: - "Join and track Google Meet sessions through Chrome or Twilio. Call setup_status before join/create/test_listen/test_speech; if it reports a Chrome node offline, local audio missing, or missing Twilio dial plan, surface that blocker instead of retrying or switching transports. 
Twilio cannot dial a Meet URL directly: provide dialInNumber plus optional pin/dtmfSequence, or configure twilio.defaultDialInNumber. Offline nodes are diagnostics only, not usable candidates. If local Chrome realtime audio is unsupported on this OS, use mode=transcribe, transport=twilio, or a macOS chrome-node for realtime Chrome. If a Meet tab is already open after a timeout, call recover_current_tab before retrying join to report login, permission, or admission blockers without opening another tab.", + "Join and track Google Meet sessions through Chrome or Twilio. Call setup_status before join/create/test_listen/test_speech; if it reports a Chrome node offline, local audio missing, or missing Twilio dial plan, surface that blocker instead of retrying or switching transports. Twilio cannot dial a Meet URL directly: provide dialInNumber plus optional pin/dtmfSequence, or configure twilio.defaultDialInNumber. Offline nodes are diagnostics only, not usable candidates. If local Chrome talk-back audio is unsupported on this OS, use mode=transcribe, transport=twilio, or a macOS chrome-node for agent/bidi Chrome. If a Meet tab is already open after a timeout, call recover_current_tab before retrying join to report login, permission, or admission blockers without opening another tab.", parameters: GoogleMeetToolSchema, async execute(_toolCallId, params) { const raw = asParamRecord(params); diff --git a/extensions/google-meet/openclaw.plugin.json b/extensions/google-meet/openclaw.plugin.json index e8099751421..e0db9c5d2ea 100644 --- a/extensions/google-meet/openclaw.plugin.json +++ b/extensions/google-meet/openclaw.plugin.json @@ -28,7 +28,7 @@ }, "defaultMode": { "label": "Default Mode", - "help": "Realtime voice is the default." + "help": "Agent uses realtime transcription plus regular OpenClaw TTS. Bidi uses the realtime voice model directly. Transcribe observes only." 
}, "chrome.audioBackend": { "label": "Chrome Audio Backend", @@ -145,7 +145,7 @@ }, "realtime.strategy": { "label": "Realtime Strategy", - "help": "Agent routes participant speech through OpenClaw before speaking; bidi lets the realtime model answer directly." + "help": "Legacy realtime alias setting. Use mode=agent or mode=bidi for new Meet joins." }, "realtime.provider": { "label": "Realtime Provider", @@ -227,8 +227,8 @@ }, "defaultMode": { "type": "string", - "enum": ["realtime", "transcribe"], - "default": "realtime" + "enum": ["agent", "bidi", "realtime", "transcribe"], + "default": "agent" }, "chrome": { "type": "object", @@ -422,7 +422,7 @@ }, "instructions": { "type": "string", - "default": "You are joining a private Google Meet as an OpenClaw voice transport. Keep spoken replies brief and natural. In agent strategy, wait for OpenClaw consult results and speak them exactly. In bidi strategy, answer directly and call openclaw_agent_consult for deeper reasoning, current information, or tools." + "default": "You are joining a private Google Meet as an OpenClaw voice transport. Keep spoken replies brief and natural. In agent mode, wait for OpenClaw consult results and speak them exactly. In bidi mode, answer directly and call openclaw_agent_consult for deeper reasoning, current information, or tools." 
}, "introMessage": { "type": "string", diff --git a/extensions/google-meet/src/cli.test.ts b/extensions/google-meet/src/cli.test.ts index e8b008aa0e4..2c1dae2738a 100644 --- a/extensions/google-meet/src/cli.test.ts +++ b/extensions/google-meet/src/cli.test.ts @@ -228,7 +228,7 @@ describe("google-meet CLI", () => { { id: "audio-bridge", ok: true, - message: "Chrome command-pair realtime audio bridge configured (pcm16-24khz)", + message: "Chrome command-pair talk-back audio bridge configured (pcm16-24khz)", }, ], }), @@ -236,7 +236,7 @@ describe("google-meet CLI", () => { }).parseAsync(["googlemeet", "setup"], { from: "user" }); expect(stdout.output()).toContain("Google Meet setup: OK"); expect(stdout.output()).toContain( - "[ok] audio-bridge: Chrome command-pair realtime audio bridge configured (pcm16-24khz)", + "[ok] audio-bridge: Chrome command-pair talk-back audio bridge configured (pcm16-24khz)", ); expect(stdout.output()).not.toContain('"checks"'); } finally { @@ -675,7 +675,7 @@ describe("google-meet CLI", () => { url: "https://meet.google.com/abc-defg-hij", state: "active", transport: "twilio", - mode: "realtime", + mode: "agent", participantIdentity: "Twilio PSTN participant", createdAt: "2026-04-25T00:00:00.000Z", updatedAt: "2026-04-25T00:00:01.000Z", @@ -704,7 +704,7 @@ describe("google-meet CLI", () => { url: "https://meet.google.com/abc-defg-hij", state: "active", transport: "chrome-node", - mode: "realtime", + mode: "agent", participantIdentity: "signed-in Google Chrome profile on a paired node", createdAt: "2026-04-25T00:00:00.000Z", updatedAt: "2026-04-25T00:00:01.000Z", @@ -908,7 +908,7 @@ describe("google-meet CLI", () => { url: "https://meet.google.com/abc-defg-hij", state: "active", transport: "chrome-node", - mode: "realtime", + mode: "agent", participantIdentity: "signed-in Google Chrome profile on a paired node", createdAt: "2026-04-25T00:00:00.000Z", updatedAt: "2026-04-25T00:00:01.000Z", @@ -964,7 +964,7 @@ describe("google-meet CLI", () => 
{ url: "https://meet.google.com/abc-defg-hij", state: "active", transport: "twilio", - mode: "realtime", + mode: "agent", participantIdentity: "Twilio phone participant", createdAt: "2026-04-25T00:00:00.000Z", updatedAt: "2026-04-25T00:00:01.000Z", diff --git a/extensions/google-meet/src/cli.ts b/extensions/google-meet/src/cli.ts index edb78134f2e..b688f241346 100644 --- a/extensions/google-meet/src/cli.ts +++ b/extensions/google-meet/src/cli.ts @@ -1481,10 +1481,7 @@ export function registerGoogleMeetCli(params: { ) .option("--no-join", "Only create the meeting URL; do not join it") .option("--transport ", "Join transport: chrome, chrome-node, or twilio") - .option( - "--mode ", - "Join mode: realtime for live talk-back, transcribe for observe/control", - ) + .option("--mode ", "Join mode: agent, bidi, or transcribe") .option("--message ", "Realtime speech to trigger after join") .option("--dial-in-number ", "Meet dial-in number for Twilio transport") .option("--pin ", "Meet phone PIN; # is appended if omitted") @@ -1665,10 +1662,7 @@ export function registerGoogleMeetCli(params: { .command("join") .argument("[url]", "Explicit https://meet.google.com/... URL") .option("--transport ", "Transport: chrome, chrome-node, or twilio") - .option( - "--mode ", - "Mode: realtime for live talk-back, transcribe to join without the realtime voice bridge", - ) + .option("--mode ", "Mode: agent, bidi, or transcribe") .option("--message ", "Realtime speech to trigger after join") .option("--dial-in-number ", "Meet dial-in number for Twilio transport") .option("--pin ", "Meet phone PIN; # is appended if omitted") @@ -1703,10 +1697,7 @@ export function registerGoogleMeetCli(params: { .command("test-speech") .argument("[url]", "Explicit https://meet.google.com/... 
URL") .option("--transport ", "Transport: chrome, chrome-node, or twilio") - .option( - "--mode ", - "Mode: realtime for live talk-back, transcribe to join without the realtime voice bridge", - ) + .option("--mode ", "Mode: agent, bidi, or transcribe") .option( "--message ", "Realtime speech to trigger", diff --git a/extensions/google-meet/src/config.ts b/extensions/google-meet/src/config.ts index 5a094f96d24..ae423e89052 100644 --- a/extensions/google-meet/src/config.ts +++ b/extensions/google-meet/src/config.ts @@ -9,7 +9,8 @@ import { } from "openclaw/plugin-sdk/text-runtime"; export type GoogleMeetTransport = "chrome" | "chrome-node" | "twilio"; -export type GoogleMeetMode = "realtime" | "transcribe"; +export type GoogleMeetMode = "agent" | "bidi" | "transcribe"; +export type GoogleMeetModeInput = GoogleMeetMode | "realtime"; export type GoogleMeetRealtimeStrategy = "agent" | "bidi"; type GoogleMeetChromeAudioFormat = "pcm16-24khz" | "g711-ulaw-8khz"; export type GoogleMeetToolPolicy = RealtimeVoiceAgentConsultToolPolicy; @@ -162,7 +163,7 @@ const DEFAULT_GOOGLE_MEET_BARGE_IN_RMS_THRESHOLD = 650; const DEFAULT_GOOGLE_MEET_BARGE_IN_PEAK_THRESHOLD = 2500; const DEFAULT_GOOGLE_MEET_BARGE_IN_COOLDOWN_MS = 900; -const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw voice transport. Keep spoken replies brief and natural. In agent strategy, wait for OpenClaw consult results and speak them exactly. In bidi strategy, answer directly and call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} for deeper reasoning, current information, or tools.`; +const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw voice transport. Keep spoken replies brief and natural. In agent mode, wait for OpenClaw consult results and speak them exactly. 
In bidi mode, answer directly and call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} for deeper reasoning, current information, or tools.`; const DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE = "Say exactly: I'm here and listening."; const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = { @@ -172,7 +173,7 @@ const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = { enrollmentAcknowledged: false, }, defaultTransport: "chrome", - defaultMode: "realtime", + defaultMode: "agent", chrome: { audioBackend: "blackhole-2ch", audioFormat: DEFAULT_GOOGLE_MEET_CHROME_AUDIO_FORMAT, @@ -325,7 +326,12 @@ function resolveTransport(value: unknown, fallback: GoogleMeetTransport): Google function resolveMode(value: unknown, fallback: GoogleMeetMode): GoogleMeetMode { const normalized = normalizeOptionalLowercaseString(value); - return normalized === "realtime" || normalized === "transcribe" ? normalized : fallback; + if (normalized === "realtime") { + return "agent"; + } + return normalized === "agent" || normalized === "bidi" || normalized === "transcribe" + ? normalized + : fallback; } function resolveRealtimeStrategy( diff --git a/extensions/google-meet/src/create.ts b/extensions/google-meet/src/create.ts index 04e6d81eb9c..a557ef275c1 100644 --- a/extensions/google-meet/src/create.ts +++ b/extensions/google-meet/src/create.ts @@ -16,7 +16,10 @@ function normalizeTransport(value: unknown): GoogleMeetTransport | undefined { } function normalizeMode(value: unknown): GoogleMeetMode | undefined { - return value === "realtime" || value === "transcribe" ? value : undefined; + if (value === "realtime") { + return "agent"; + } + return value === "agent" || value === "bidi" || value === "transcribe" ? 
value : undefined; } function normalizeGoogleMeetAccessType(value: unknown): GoogleMeetAccessType | undefined { diff --git a/extensions/google-meet/src/node-host.ts b/extensions/google-meet/src/node-host.ts index 84912d8dbdc..21103265e6d 100644 --- a/extensions/google-meet/src/node-host.ts +++ b/extensions/google-meet/src/node-host.ts @@ -284,7 +284,7 @@ function startChrome(params: Record) { let bridgeId: string | undefined; let audioBridge: { type: "external-command" | "node-command-pair" } | undefined; - if (mode === "realtime") { + if (mode === "agent" || mode === "bidi" || mode === "realtime") { assertBlackHoleAvailable(Math.min(timeoutMs, 10_000)); const healthCommand = readStringArray(params.audioBridgeHealthCommand); @@ -299,6 +299,11 @@ function startChrome(params: Record) { const bridgeCommand = readStringArray(params.audioBridgeCommand); if (bridgeCommand) { + if (mode === "agent") { + throw new Error( + "Chrome agent mode requires audioInputCommand and audioOutputCommand so OpenClaw can run STT and regular TTS directly.", + ); + } const bridge = runCommandWithTimeout(bridgeCommand, timeoutMs); if (bridge.code !== 0) { throw new Error( diff --git a/extensions/google-meet/src/realtime-node.ts b/extensions/google-meet/src/realtime-node.ts index f44d02c6d77..96569cb4885 100644 --- a/extensions/google-meet/src/realtime-node.ts +++ b/extensions/google-meet/src/realtime-node.ts @@ -1,6 +1,10 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime"; +import type { + RealtimeTranscriptionProviderPlugin, + RealtimeTranscriptionSession, +} from "openclaw/plugin-sdk/realtime-transcription"; import { createRealtimeVoiceBridgeSession, type RealtimeVoiceBridgeSession, @@ -23,7 +27,10 @@ import { recordGoogleMeetRealtimeEvent, resolveGoogleMeetRealtimeAudioFormat, 
resolveGoogleMeetRealtimeProvider, + resolveGoogleMeetRealtimeTranscriptionProvider, isGoogleMeetLikelyAssistantEchoTranscript, + convertGoogleMeetBridgeAudioForStt, + convertGoogleMeetTtsAudioForBridge, type GoogleMeetRealtimeEventEntry, type GoogleMeetRealtimeTranscriptEntry, } from "./realtime.js"; @@ -49,6 +56,307 @@ function readString(value: unknown): string | undefined { return typeof value === "string" && value.trim() ? value : undefined; } +function normalizeGoogleMeetTtsPromptText(text: string | undefined): string | undefined { + const trimmed = text?.trim(); + if (!trimmed) { + return undefined; + } + const sayExactly = trimmed.match(/^say exactly:\s*(?.+)$/is)?.groups?.text?.trim(); + if (sayExactly) { + return sayExactly.replace(/^["']|["']$/g, "").trim() || trimmed; + } + return trimmed; +} + +export async function startNodeAgentAudioBridge(params: { + config: GoogleMeetConfig; + fullConfig: OpenClawConfig; + runtime: PluginRuntime; + meetingSessionId: string; + nodeId: string; + bridgeId: string; + logger: RuntimeLogger; + providers?: RealtimeTranscriptionProviderPlugin[]; +}): Promise { + let stopped = false; + let sttSession: RealtimeTranscriptionSession | null = null; + let realtimeReady = false; + let lastInputAt: string | undefined; + let lastOutputAt: string | undefined; + let lastInputBytes = 0; + let lastOutputBytes = 0; + let suppressedInputBytes = 0; + let lastSuppressedInputAt: string | undefined; + let suppressInputUntil = 0; + let lastOutputPlayableUntilMs = 0; + let consecutiveInputErrors = 0; + let lastInputError: string | undefined; + const resolved = resolveGoogleMeetRealtimeTranscriptionProvider({ + config: params.config, + fullConfig: params.fullConfig, + providers: params.providers, + }); + const transcript: GoogleMeetRealtimeTranscriptEntry[] = []; + let agentConsultActive = false; + let pendingAgentQuestion: string | undefined; + let agentConsultDebounceTimer: ReturnType | undefined; + let ttsQueue = Promise.resolve(); + + const 
stop = async () => { + if (stopped) { + return; + } + stopped = true; + if (agentConsultDebounceTimer) { + clearTimeout(agentConsultDebounceTimer); + agentConsultDebounceTimer = undefined; + } + try { + sttSession?.close(); + } catch (error) { + params.logger.debug?.( + `[google-meet] node agent transcription bridge close ignored: ${formatErrorMessage(error)}`, + ); + } + try { + await params.runtime.nodes.invoke({ + nodeId: params.nodeId, + command: "googlemeet.chrome", + params: { action: "stop", bridgeId: params.bridgeId }, + timeoutMs: 5_000, + }); + } catch (error) { + params.logger.debug?.( + `[google-meet] node audio bridge stop ignored: ${formatErrorMessage(error)}`, + ); + } + }; + + const pushOutputAudio = async (audio: Buffer) => { + const suppression = extendGoogleMeetOutputEchoSuppression({ + audio, + audioFormat: params.config.chrome.audioFormat, + nowMs: Date.now(), + lastOutputPlayableUntilMs, + suppressInputUntilMs: suppressInputUntil, + }); + suppressInputUntil = suppression.suppressInputUntilMs; + lastOutputPlayableUntilMs = suppression.lastOutputPlayableUntilMs; + lastOutputAt = new Date().toISOString(); + lastOutputBytes += audio.byteLength; + await params.runtime.nodes.invoke({ + nodeId: params.nodeId, + command: "googlemeet.chrome", + params: { + action: "pushAudio", + bridgeId: params.bridgeId, + base64: Buffer.from(audio).toString("base64"), + }, + timeoutMs: 5_000, + }); + }; + + const enqueueSpeakText = (text: string | undefined) => { + const normalized = normalizeGoogleMeetTtsPromptText(text); + if (!normalized || stopped) { + return; + } + ttsQueue = ttsQueue + .then(async () => { + if (stopped) { + return; + } + recordGoogleMeetRealtimeTranscript(transcript, "assistant", normalized); + params.logger.info(`[google-meet] node agent assistant: ${normalized}`); + const result = await params.runtime.tts.textToSpeechTelephony({ + text: normalized, + cfg: params.fullConfig, + }); + if (!result.success || !result.audioBuffer || 
!result.sampleRate) { + throw new Error(result.error ?? "TTS conversion failed"); + } + await pushOutputAudio( + convertGoogleMeetTtsAudioForBridge( + result.audioBuffer, + result.sampleRate, + params.config, + result.outputFormat, + ), + ); + }) + .catch((error) => { + params.logger.warn(`[google-meet] node agent TTS failed: ${formatErrorMessage(error)}`); + }); + }; + + const runAgentConsultForUserTranscript = async (question: string): Promise => { + const trimmed = question.trim(); + if (!trimmed || stopped) { + return; + } + if (agentConsultActive) { + pendingAgentQuestion = trimmed; + return; + } + agentConsultActive = true; + let nextQuestion: string | undefined = trimmed; + try { + while (nextQuestion) { + if (stopped) { + return; + } + const currentQuestion = nextQuestion; + pendingAgentQuestion = undefined; + params.logger.info(`[google-meet] node agent consult: ${currentQuestion}`); + const result = await consultOpenClawAgentForGoogleMeet({ + config: params.config, + fullConfig: params.fullConfig, + runtime: params.runtime, + logger: params.logger, + meetingSessionId: params.meetingSessionId, + args: { + question: currentQuestion, + responseStyle: "Brief, natural spoken answer for a live meeting.", + }, + transcript, + }); + enqueueSpeakText(result.text); + nextQuestion = pendingAgentQuestion; + } + } catch (error) { + params.logger.warn(`[google-meet] node agent consult failed: ${formatErrorMessage(error)}`); + enqueueSpeakText("I hit an error while checking that. Please try again."); + } finally { + agentConsultActive = false; + const queuedQuestion = pendingAgentQuestion; + pendingAgentQuestion = undefined; + if (queuedQuestion && !stopped) { + void runAgentConsultForUserTranscript(queuedQuestion); + } + } + }; + + const enqueueAgentConsultForUserTranscript = (question: string): void => { + const trimmed = question.trim(); + if (!trimmed || stopped) { + return; + } + pendingAgentQuestion = pendingAgentQuestion ? 
`${pendingAgentQuestion}\n${trimmed}` : trimmed; + if (agentConsultDebounceTimer) { + clearTimeout(agentConsultDebounceTimer); + } + agentConsultDebounceTimer = setTimeout(() => { + agentConsultDebounceTimer = undefined; + const queuedQuestion = pendingAgentQuestion; + pendingAgentQuestion = undefined; + if (queuedQuestion && !stopped) { + void runAgentConsultForUserTranscript(queuedQuestion); + } + }, GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS); + agentConsultDebounceTimer.unref?.(); + }; + + sttSession = resolved.provider.createSession({ + providerConfig: resolved.providerConfig, + onTranscript: (text) => { + const trimmed = text.trim(); + if (!trimmed || stopped) { + return; + } + recordGoogleMeetRealtimeTranscript(transcript, "user", trimmed); + params.logger.info(`[google-meet] node agent user: ${trimmed}`); + if (isGoogleMeetLikelyAssistantEchoTranscript({ transcript, text: trimmed })) { + params.logger.info( + `[google-meet] node agent ignored assistant echo transcript: ${trimmed}`, + ); + return; + } + enqueueAgentConsultForUserTranscript(trimmed); + }, + onError: (error) => { + params.logger.warn( + `[google-meet] node agent transcription bridge failed: ${formatErrorMessage(error)}`, + ); + void stop(); + }, + }); + await sttSession.connect(); + realtimeReady = true; + + void (async () => { + for (;;) { + if (stopped) { + break; + } + try { + const raw = await params.runtime.nodes.invoke({ + nodeId: params.nodeId, + command: "googlemeet.chrome", + params: { action: "pullAudio", bridgeId: params.bridgeId, timeoutMs: 250 }, + timeoutMs: 2_000, + }); + const result = asRecord(asRecord(raw).payload ?? 
raw); + consecutiveInputErrors = 0; + lastInputError = undefined; + const base64 = readString(result.base64); + if (base64) { + const audio = Buffer.from(base64, "base64"); + if (Date.now() < suppressInputUntil) { + lastSuppressedInputAt = new Date().toISOString(); + suppressedInputBytes += audio.byteLength; + continue; + } + lastInputAt = new Date().toISOString(); + lastInputBytes += audio.byteLength; + sttSession?.sendAudio(convertGoogleMeetBridgeAudioForStt(audio, params.config)); + } + if (result.closed === true) { + await stop(); + } + } catch (error) { + if (!stopped) { + const message = formatErrorMessage(error); + consecutiveInputErrors += 1; + lastInputError = message; + params.logger.warn( + `[google-meet] node agent audio input failed (${consecutiveInputErrors}/5): ${message}`, + ); + if (consecutiveInputErrors >= 5 || /unknown bridgeId|bridge is not open/i.test(message)) { + await stop(); + } else { + await new Promise((resolve) => setTimeout(resolve, 250)); + } + } + } + } + })(); + + return { + type: "node-command-pair", + providerId: resolved.provider.id, + nodeId: params.nodeId, + bridgeId: params.bridgeId, + speak: enqueueSpeakText, + getHealth: () => ({ + providerConnected: sttSession?.isConnected() ?? 
false, + realtimeReady, + audioInputActive: lastInputBytes > 0, + audioOutputActive: lastOutputBytes > 0, + lastInputAt, + lastOutputAt, + lastSuppressedInputAt, + lastInputBytes, + lastOutputBytes, + suppressedInputBytes, + ...getGoogleMeetRealtimeTranscriptHealth(transcript), + consecutiveInputErrors, + lastInputError, + bridgeClosed: stopped, + }), + stop, + }; +} + export async function startNodeRealtimeAudioBridge(params: { config: GoogleMeetConfig; fullConfig: OpenClawConfig; diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index 872f4b5f357..c8d8ee92c4f 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -3,10 +3,20 @@ import type { Writable } from "node:stream"; import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime"; +import { + getRealtimeTranscriptionProvider, + listRealtimeTranscriptionProviders, + type RealtimeTranscriptionProviderConfig, + type RealtimeTranscriptionProviderPlugin, + type RealtimeTranscriptionSession, +} from "openclaw/plugin-sdk/realtime-transcription"; import { createRealtimeVoiceBridgeSession, + convertPcmToMulaw8k, + mulawToPcm, REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + resamplePcm, resolveConfiguredRealtimeVoiceProvider, type RealtimeVoiceBridgeSession, type RealtimeVoiceBridgeEvent, @@ -56,6 +66,11 @@ type ResolvedRealtimeProvider = { providerConfig: RealtimeVoiceProviderConfig; }; +type ResolvedRealtimeTranscriptionProvider = { + provider: RealtimeTranscriptionProviderPlugin; + providerConfig: RealtimeTranscriptionProviderConfig; +}; + export type GoogleMeetRealtimeTranscriptEntry = { at: string; role: "user" | "assistant"; @@ -243,6 +258,100 @@ export function resolveGoogleMeetRealtimeAudioFormat(config: GoogleMeetConfig) 
{ : REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ; } +export function convertGoogleMeetBridgeAudioForStt( + audio: Buffer, + config: GoogleMeetConfig, +): Buffer { + if (config.chrome.audioFormat === "g711-ulaw-8khz") { + return audio; + } + return convertPcmToMulaw8k(audio, 24_000); +} + +export function convertGoogleMeetTtsAudioForBridge( + audio: Buffer, + sampleRate: number, + config: GoogleMeetConfig, + outputFormat?: string, +): Buffer { + const sourceFormat = sourceTelephonyTtsFormat(outputFormat); + if ( + config.chrome.audioFormat === "g711-ulaw-8khz" && + sourceFormat === "mulaw" && + sampleRate === 8_000 + ) { + return audio; + } + const pcm = decodeGoogleMeetTelephonyTtsAudio(audio, sourceFormat); + return config.chrome.audioFormat === "g711-ulaw-8khz" + ? convertPcmToMulaw8k(pcm, sampleRate) + : resamplePcm(pcm, sampleRate, 24_000); +} + +type GoogleMeetTelephonyTtsFormat = "pcm" | "mulaw" | "alaw"; + +function sourceTelephonyTtsFormat(outputFormat: string | undefined): GoogleMeetTelephonyTtsFormat { + const normalized = outputFormat?.trim().toLowerCase().replaceAll("_", "-") ?? 
""; + if ( + !normalized || + normalized === "pcm" || + normalized.startsWith("pcm-") || + normalized.includes("pcm16") || + normalized.includes("16bit-mono-pcm") + ) { + return "pcm"; + } + if ( + normalized === "mulaw" || + normalized === "ulaw" || + normalized.includes("mu-law") || + normalized.includes("mulaw") || + normalized.includes("ulaw") + ) { + return "mulaw"; + } + if (normalized === "alaw" || normalized.includes("a-law") || normalized.includes("alaw")) { + return "alaw"; + } + throw new Error(`Unsupported telephony TTS output format for Google Meet: ${outputFormat}`); +} + +function decodeGoogleMeetTelephonyTtsAudio( + audio: Buffer, + sourceFormat: GoogleMeetTelephonyTtsFormat, +): Buffer { + switch (sourceFormat) { + case "pcm": + return audio; + case "mulaw": + return mulawToPcm(audio); + case "alaw": + return alawToPcm(audio); + } + return unsupportedGoogleMeetTelephonyTtsFormat(sourceFormat); +} + +function unsupportedGoogleMeetTelephonyTtsFormat(_format: never): never { + throw new Error("Unsupported telephony TTS output format for Google Meet"); +} + +function alawToPcm(alaw: Buffer): Buffer { + const pcm = Buffer.alloc(alaw.length * 2); + for (let index = 0; index < alaw.length; index += 1) { + pcm.writeInt16LE(alawByteToLinear(alaw[index] ?? 0), index * 2); + } + return pcm; +} + +function alawByteToLinear(value: number): number { + const aLaw = value ^ 0x55; + const sign = aLaw & 0x80; + const exponent = (aLaw & 0x70) >> 4; + const mantissa = aLaw & 0x0f; + let sample = exponent === 0 ? (mantissa << 4) + 8 : ((mantissa << 4) + 0x108) << (exponent - 1); + return sign ? 
sample : -sample; +} + export function resolveGoogleMeetRealtimeProvider(params: { config: GoogleMeetConfig; fullConfig: OpenClawConfig; @@ -258,6 +367,40 @@ export function resolveGoogleMeetRealtimeProvider(params: { }); } +export function resolveGoogleMeetRealtimeTranscriptionProvider(params: { + config: GoogleMeetConfig; + fullConfig: OpenClawConfig; + providers?: RealtimeTranscriptionProviderPlugin[]; +}): ResolvedRealtimeTranscriptionProvider { + const providers = params.providers ?? listRealtimeTranscriptionProviders(params.fullConfig); + if (providers.length === 0) { + throw new Error("No configured realtime transcription provider registered"); + } + const configuredProvider = params.config.realtime.provider + ? (params.providers?.find( + (entry) => + entry.id === params.config.realtime.provider || + entry.aliases?.includes(params.config.realtime.provider ?? ""), + ) ?? getRealtimeTranscriptionProvider(params.config.realtime.provider, params.fullConfig)) + : undefined; + const provider = configuredProvider ?? providers[0]; + if (!provider) { + throw new Error("No configured realtime transcription provider registered"); + } + const rawConfig = params.config.realtime.provider + ? (params.config.realtime.providers[params.config.realtime.provider] ?? + params.config.realtime.providers[provider.id] ?? + {}) + : (params.config.realtime.providers[provider.id] ?? {}); + const providerConfig = provider.resolveConfig + ? 
provider.resolveConfig({ cfg: params.fullConfig, rawConfig })
+    : rawConfig;
+  if (!provider.isConfigured({ cfg: params.fullConfig, providerConfig })) {
+    throw new Error(`Realtime transcription provider "${provider.id}" is not configured`);
+  }
+  return { provider, providerConfig };
+}
+
 export function buildGoogleMeetSpeakExactUserMessage(text: string): string {
   return [
     "Speak this exact OpenClaw answer to the meeting, without adding, removing, or rephrasing words.",
@@ -265,6 +408,319 @@ export function buildGoogleMeetSpeakExactUserMessage(text: string): string {
   ].join("\n");
 }
 
+function normalizeGoogleMeetTtsPromptText(text: string | undefined): string | undefined {
+  const trimmed = text?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  const sayExactly = trimmed.match(/^say exactly:\s*(?<text>.+)$/is)?.groups?.text?.trim();
+  if (sayExactly) {
+    return sayExactly.replace(/^["']|["']$/g, "").trim() || trimmed;
+  }
+  return trimmed;
+}
+
+export async function startCommandAgentAudioBridge(params: {
+  config: GoogleMeetConfig;
+  fullConfig: OpenClawConfig;
+  runtime: PluginRuntime;
+  meetingSessionId: string;
+  inputCommand: string[];
+  outputCommand: string[];
+  logger: RuntimeLogger;
+  providers?: RealtimeTranscriptionProviderPlugin[];
+  spawn?: SpawnFn;
+}): Promise<ChromeRealtimeAudioBridgeHandle> {
+  const input = splitCommand(params.inputCommand);
+  const output = splitCommand(params.outputCommand);
+  const spawnFn: SpawnFn =
+    params.spawn ?? 
+    ((command, args, options) => spawn(command, args, options) as unknown as BridgeProcess);
+  const outputProcess = spawnFn(output.command, output.args, {
+    stdio: ["pipe", "ignore", "pipe"],
+  });
+  const inputProcess = spawnFn(input.command, input.args, {
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+  let stopped = false;
+  let sttSession: RealtimeTranscriptionSession | null = null;
+  let realtimeReady = false;
+  let lastInputAt: string | undefined;
+  let lastOutputAt: string | undefined;
+  let lastInputBytes = 0;
+  let lastOutputBytes = 0;
+  let suppressedInputBytes = 0;
+  let lastSuppressedInputAt: string | undefined;
+  let suppressInputUntil = 0;
+  let lastOutputPlayableUntilMs = 0;
+  let agentConsultActive = false;
+  let pendingAgentQuestion: string | undefined;
+  let agentConsultDebounceTimer: ReturnType<typeof setTimeout> | undefined;
+  let ttsQueue = Promise.resolve();
+  const transcript: GoogleMeetRealtimeTranscriptEntry[] = [];
+  const resolved = resolveGoogleMeetRealtimeTranscriptionProvider({
+    config: params.config,
+    fullConfig: params.fullConfig,
+    providers: params.providers,
+  });
+
+  const terminateProcess = (proc: BridgeProcess, signal: NodeJS.Signals = "SIGTERM") => {
+    if (proc.killed && signal !== "SIGKILL") {
+      return;
+    }
+    let exited = false;
+    proc.on("exit", () => {
+      exited = true;
+    });
+    try {
+      proc.kill(signal);
+    } catch {
+      return;
+    }
+    if (signal === "SIGKILL") {
+      return;
+    }
+    const timer = setTimeout(() => {
+      if (!exited) {
+        try {
+          proc.kill("SIGKILL");
+        } catch {
+          // Process may have exited after the grace check. 
+ } + } + }, 1000); + timer.unref?.(); + }; + + const stop = async () => { + if (stopped) { + return; + } + stopped = true; + if (agentConsultDebounceTimer) { + clearTimeout(agentConsultDebounceTimer); + agentConsultDebounceTimer = undefined; + } + try { + sttSession?.close(); + } catch (error) { + params.logger.debug?.( + `[google-meet] agent transcription bridge close ignored: ${formatErrorMessage(error)}`, + ); + } + terminateProcess(inputProcess); + terminateProcess(outputProcess); + }; + + const fail = (label: string) => (error: Error) => { + params.logger.warn(`[google-meet] ${label} failed: ${formatErrorMessage(error)}`); + void stop(); + }; + inputProcess.on("error", fail("audio input command")); + inputProcess.on("exit", (code, signal) => { + if (!stopped) { + params.logger.warn(`[google-meet] audio input command exited (${code ?? signal ?? "done"})`); + void stop(); + } + }); + inputProcess.stderr?.on("data", (chunk) => { + params.logger.debug?.(`[google-meet] audio input: ${String(chunk).trim()}`); + }); + outputProcess.on("error", fail("audio output command")); + outputProcess.stdin?.on?.("error", fail("audio output command")); + outputProcess.on("exit", (code, signal) => { + if (!stopped) { + params.logger.warn(`[google-meet] audio output command exited (${code ?? signal ?? 
"done"})`); + void stop(); + } + }); + outputProcess.stderr?.on("data", (chunk) => { + params.logger.debug?.(`[google-meet] audio output: ${String(chunk).trim()}`); + }); + + const writeOutputAudio = (audio: Buffer) => { + const suppression = extendGoogleMeetOutputEchoSuppression({ + audio, + audioFormat: params.config.chrome.audioFormat, + nowMs: Date.now(), + lastOutputPlayableUntilMs, + suppressInputUntilMs: suppressInputUntil, + }); + suppressInputUntil = suppression.suppressInputUntilMs; + lastOutputPlayableUntilMs = suppression.lastOutputPlayableUntilMs; + lastOutputAt = new Date().toISOString(); + lastOutputBytes += audio.byteLength; + try { + outputProcess.stdin?.write(audio); + } catch (error) { + fail("audio output command")(error as Error); + } + }; + + const enqueueSpeakText = (text: string | undefined) => { + const normalized = normalizeGoogleMeetTtsPromptText(text); + if (!normalized || stopped) { + return; + } + ttsQueue = ttsQueue + .then(async () => { + if (stopped) { + return; + } + recordGoogleMeetRealtimeTranscript(transcript, "assistant", normalized); + params.logger.info(`[google-meet] agent assistant: ${normalized}`); + const result = await params.runtime.tts.textToSpeechTelephony({ + text: normalized, + cfg: params.fullConfig, + }); + if (!result.success || !result.audioBuffer || !result.sampleRate) { + throw new Error(result.error ?? 
"TTS conversion failed");
+        }
+        writeOutputAudio(
+          convertGoogleMeetTtsAudioForBridge(
+            result.audioBuffer,
+            result.sampleRate,
+            params.config,
+            result.outputFormat,
+          ),
+        );
+      })
+      .catch((error) => {
+        params.logger.warn(`[google-meet] agent TTS failed: ${formatErrorMessage(error)}`);
+      });
+  };
+
+  const runAgentConsultForUserTranscript = async (question: string): Promise<void> => {
+    const trimmed = question.trim();
+    if (!trimmed || stopped) {
+      return;
+    }
+    if (agentConsultActive) {
+      pendingAgentQuestion = trimmed;
+      return;
+    }
+    agentConsultActive = true;
+    let nextQuestion: string | undefined = trimmed;
+    try {
+      while (nextQuestion) {
+        if (stopped) {
+          return;
+        }
+        const currentQuestion = nextQuestion;
+        pendingAgentQuestion = undefined;
+        params.logger.info(`[google-meet] agent consult: ${currentQuestion}`);
+        const result = await consultOpenClawAgentForGoogleMeet({
+          config: params.config,
+          fullConfig: params.fullConfig,
+          runtime: params.runtime,
+          logger: params.logger,
+          meetingSessionId: params.meetingSessionId,
+          args: {
+            question: currentQuestion,
+            responseStyle: "Brief, natural spoken answer for a live meeting.",
+          },
+          transcript,
+        });
+        enqueueSpeakText(result.text);
+        nextQuestion = pendingAgentQuestion;
+      }
+    } catch (error) {
+      params.logger.warn(`[google-meet] agent consult failed: ${formatErrorMessage(error)}`);
+      enqueueSpeakText("I hit an error while checking that. Please try again.");
+    } finally {
+      agentConsultActive = false;
+      const queuedQuestion = pendingAgentQuestion;
+      pendingAgentQuestion = undefined;
+      if (queuedQuestion && !stopped) {
+        void runAgentConsultForUserTranscript(queuedQuestion);
+      }
+    }
+  };
+
+  const enqueueAgentConsultForUserTranscript = (question: string): void => {
+    const trimmed = question.trim();
+    if (!trimmed || stopped) {
+      return;
+    }
+    pendingAgentQuestion = pendingAgentQuestion ? 
`${pendingAgentQuestion}\n${trimmed}` : trimmed; + if (agentConsultDebounceTimer) { + clearTimeout(agentConsultDebounceTimer); + } + agentConsultDebounceTimer = setTimeout(() => { + agentConsultDebounceTimer = undefined; + const queuedQuestion = pendingAgentQuestion; + pendingAgentQuestion = undefined; + if (queuedQuestion && !stopped) { + void runAgentConsultForUserTranscript(queuedQuestion); + } + }, GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS); + agentConsultDebounceTimer.unref?.(); + }; + + sttSession = resolved.provider.createSession({ + providerConfig: resolved.providerConfig, + onTranscript: (text) => { + const trimmed = text.trim(); + if (!trimmed || stopped) { + return; + } + recordGoogleMeetRealtimeTranscript(transcript, "user", trimmed); + params.logger.info(`[google-meet] agent user: ${trimmed}`); + if (isGoogleMeetLikelyAssistantEchoTranscript({ transcript, text: trimmed })) { + params.logger.info(`[google-meet] agent ignored assistant echo transcript: ${trimmed}`); + return; + } + enqueueAgentConsultForUserTranscript(trimmed); + }, + onError: (error) => { + params.logger.warn( + `[google-meet] agent transcription bridge failed: ${formatErrorMessage(error)}`, + ); + void stop(); + }, + }); + + await sttSession.connect(); + realtimeReady = true; + + inputProcess.stdout?.on("data", (chunk) => { + if (stopped) { + return; + } + const audio = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk); + if (Date.now() < suppressInputUntil) { + lastSuppressedInputAt = new Date().toISOString(); + suppressedInputBytes += audio.byteLength; + return; + } + lastInputAt = new Date().toISOString(); + lastInputBytes += audio.byteLength; + sttSession?.sendAudio(convertGoogleMeetBridgeAudioForStt(audio, params.config)); + }); + + return { + providerId: resolved.provider.id, + inputCommand: params.inputCommand, + outputCommand: params.outputCommand, + speak: enqueueSpeakText, + getHealth: () => ({ + providerConnected: sttSession?.isConnected() ?? 
false, + realtimeReady, + audioInputActive: lastInputBytes > 0, + audioOutputActive: lastOutputBytes > 0, + lastInputAt, + lastOutputAt, + lastSuppressedInputAt, + lastInputBytes, + lastOutputBytes, + suppressedInputBytes, + ...getGoogleMeetRealtimeTranscriptHealth(transcript), + bridgeClosed: stopped, + }), + stop, + }; +} + export async function startCommandRealtimeAudioBridge(params: { config: GoogleMeetConfig; fullConfig: OpenClawConfig; diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index daa5c2716a6..003afc8b5e0 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -64,6 +64,10 @@ function resolveMode(input: GoogleMeetMode | undefined, config: GoogleMeetConfig return input ?? config.defaultMode; } +function isGoogleMeetTalkBackMode(mode: GoogleMeetMode): boolean { + return mode === "agent" || mode === "bidi"; +} + function hasRealtimeAudioOutputAdvanced( health: GoogleMeetChromeHealth | undefined, startOutputBytes: number, @@ -125,7 +129,7 @@ function evaluateSpeechReadiness(session: GoogleMeetSession): { reason?: NonNullable; message?: string; } { - if (session.mode !== "realtime" || !session.chrome) { + if (!isGoogleMeetTalkBackMode(session.mode) || !session.chrome) { return { ready: true }; } if (!isManagedChromeBrowserSession(session)) { @@ -278,7 +282,7 @@ export class GoogleMeetRuntime { }); } } - if (transport === "chrome" && mode === "realtime") { + if (transport === "chrome" && isGoogleMeetTalkBackMode(mode)) { try { await assertBlackHole2chAvailable({ runtime: this.params.runtime, @@ -313,7 +317,7 @@ export class GoogleMeetRuntime { ok: commands.length > 0 && missingCommands.length === 0, message: commands.length === 0 - ? "Chrome realtime audio commands are not configured" + ? "Chrome talk-back audio commands are not configured" : missingCommands.length === 0 ? `Chrome audio command${commands.length === 1 ? 
"" : "s"} available: ${commands.join(", ")}` : `Chrome audio command${missingCommands.length === 1 ? "" : "s"} missing: ${missingCommands.join(", ")}`, @@ -368,7 +372,7 @@ export class GoogleMeetRuntime { ]; reusable.updatedAt = nowIso(); const spoken = - mode === "realtime" && speechInstructions + isGoogleMeetTalkBackMode(mode) && speechInstructions ? await this.#speakWhenReady(reusable, speechInstructions) : false; return { session: reusable, spoken }; @@ -391,8 +395,8 @@ export class GoogleMeetRuntime { ? "signed-in Google Chrome profile on a paired node" : "signed-in Google Chrome profile", realtime: { - enabled: mode === "realtime", - strategy: this.params.config.realtime.strategy, + enabled: isGoogleMeetTalkBackMode(mode), + strategy: mode === "bidi" ? "bidi" : "agent", provider: this.params.config.realtime.provider, model: this.params.config.realtime.model, toolPolicy: this.params.config.realtime.toolPolicy, @@ -435,7 +439,7 @@ export class GoogleMeetRuntime { ? transport === "chrome-node" ? "Chrome node transport joins as the signed-in Google profile on the selected node and routes realtime audio through the node bridge." : "Chrome transport joins as the signed-in Google profile and routes realtime audio through the configured bridge." - : mode === "realtime" + : isGoogleMeetTalkBackMode(mode) ? "Chrome transport joins as the signed-in Google profile and expects BlackHole 2ch audio routing." : "Chrome transport joins as the signed-in Google profile without starting the realtime audio bridge.", ); @@ -459,12 +463,11 @@ export class GoogleMeetRuntime { dialInNumber, dtmfSequence, logger: this.params.logger, - message: - mode === "realtime" - ? (request.message ?? - this.params.config.voiceCall.introMessage ?? - this.params.config.realtime.introMessage) - : undefined, + message: isGoogleMeetTalkBackMode(mode) + ? (request.message ?? + this.params.config.voiceCall.introMessage ?? 
+ this.params.config.realtime.introMessage) + : undefined, }) : undefined; delegatedTwilioSpoken = Boolean(voiceCallResult?.introSent); @@ -501,7 +504,7 @@ export class GoogleMeetRuntime { const spoken = transport === "twilio" ? delegatedTwilioSpoken - : mode === "realtime" && speechInstructions + : isGoogleMeetTalkBackMode(mode) && speechInstructions ? await this.#speakWhenReady(session, speechInstructions) : false; return { session, spoken }; @@ -613,7 +616,7 @@ export class GoogleMeetRuntime { }> { if (request.mode === "transcribe") { throw new Error( - "test_speech requires mode: realtime; use join mode: transcribe for observe-only sessions.", + "test_speech requires mode: agent or bidi; use join mode: transcribe for observe-only sessions.", ); } const url = normalizeMeetUrl(request.url); @@ -625,14 +628,14 @@ export class GoogleMeetRuntime { session.state === "active" && isSameMeetUrlForReuse(session.url, url) && session.transport === transport && - session.mode === "realtime", + isGoogleMeetTalkBackMode(session.mode), ); const startOutputBytes = existingSession?.chrome?.health?.lastOutputBytes ?? 0; const result = await this.join({ ...request, transport, url, - mode: "realtime", + mode: "agent", message: request.message ?? 
"Say exactly: Google Meet speech test complete.", }); let health = result.session.chrome?.health; @@ -687,9 +690,9 @@ export class GoogleMeetRuntime { recentTranscript?: GoogleMeetChromeHealth["recentTranscript"]; session: GoogleMeetSession; }> { - if (request.mode === "realtime") { + if (request.mode && isGoogleMeetTalkBackMode(request.mode)) { throw new Error( - "test_listen requires mode: transcribe; use test_speech for realtime talk-back.", + "test_listen requires mode: transcribe; use test_speech for talk-back sessions.", ); } const url = normalizeMeetUrl(request.url); @@ -780,7 +783,11 @@ export class GoogleMeetRuntime { this.#refreshSpeechReadiness(session); return; } - if (!options.force && session.mode === "realtime" && evaluateSpeechReadiness(session).ready) { + if ( + !options.force && + isGoogleMeetTalkBackMode(session.mode) && + evaluateSpeechReadiness(session).ready + ) { this.#refreshSpeechReadiness(session); return; } @@ -838,7 +845,7 @@ export class GoogleMeetRuntime { async #ensureChromeRealtimeBridge(session: GoogleMeetSession) { if ( - session.mode !== "realtime" || + !isGoogleMeetTalkBackMode(session.mode) || session.transport !== "chrome" || session.state !== "active" || !session.chrome || diff --git a/extensions/google-meet/src/setup.ts b/extensions/google-meet/src/setup.ts index 8aefcdb3fee..17958b4f361 100644 --- a/extensions/google-meet/src/setup.ts +++ b/extensions/google-meet/src/setup.ts @@ -109,7 +109,8 @@ export function getGoogleMeetSetupStatus( const mode = options?.mode ?? config.defaultMode; const transport = options?.transport ?? 
config.defaultTransport; const needsChromeRealtimeAudio = - mode === "realtime" && (transport === "chrome" || transport === "chrome-node"); + (mode === "agent" || mode === "bidi") && + (transport === "chrome" || transport === "chrome-node"); const pluginEntries = asRecord(asRecord(fullConfig.plugins).entries); const pluginAllow = asRecord(fullConfig.plugins).allow; const voiceCallEntry = asRecord(pluginEntries["voice-call"]); @@ -142,17 +143,24 @@ export function getGoogleMeetSetupStatus( }); if (needsChromeRealtimeAudio) { + const hasCommandPair = Boolean( + config.chrome.audioInputCommand && config.chrome.audioOutputCommand, + ); + const hasExternalBridge = Boolean(config.chrome.audioBridgeCommand); + const agentModeExternalBridgeInvalid = mode === "agent" && hasExternalBridge; checks.push({ id: "audio-bridge", - ok: Boolean( - config.chrome.audioBridgeCommand || - (config.chrome.audioInputCommand && config.chrome.audioOutputCommand), - ), - message: config.chrome.audioBridgeCommand - ? "Chrome audio bridge command configured" - : config.chrome.audioInputCommand && config.chrome.audioOutputCommand - ? `Chrome command-pair realtime audio bridge configured (${config.chrome.audioFormat})` - : "Chrome realtime audio bridge not configured", + ok: + mode === "agent" + ? hasCommandPair && !agentModeExternalBridgeInvalid + : hasExternalBridge || hasCommandPair, + message: agentModeExternalBridgeInvalid + ? "Chrome agent mode requires chrome.audioInputCommand and chrome.audioOutputCommand; chrome.audioBridgeCommand is bidi-only" + : hasExternalBridge + ? "Chrome audio bridge command configured" + : hasCommandPair + ? 
`Chrome command-pair talk-back audio bridge configured (${config.chrome.audioFormat})` + : "Chrome talk-back audio bridge not configured", }); } else if (transport === "chrome" || transport === "chrome-node") { checks.push({ diff --git a/extensions/google-meet/src/transports/chrome.ts b/extensions/google-meet/src/transports/chrome.ts index 1d001717f43..d002265aeb1 100644 --- a/extensions/google-meet/src/transports/chrome.ts +++ b/extensions/google-meet/src/transports/chrome.ts @@ -2,12 +2,14 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/config-types"; import { callGatewayFromCli } from "openclaw/plugin-sdk/gateway-runtime"; import type { PluginRuntime } from "openclaw/plugin-sdk/plugin-runtime"; import type { RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime"; -import type { GoogleMeetConfig } from "../config.js"; +import type { GoogleMeetConfig, GoogleMeetMode } from "../config.js"; import { + startNodeAgentAudioBridge, startNodeRealtimeAudioBridge, type ChromeNodeRealtimeAudioBridgeHandle, } from "../realtime-node.js"; import { + startCommandAgentAudioBridge, startCommandRealtimeAudioBridge, type ChromeRealtimeAudioBridgeHandle, } from "../realtime.js"; @@ -46,6 +48,10 @@ export const __testing = { meetStatusScriptForTest: meetStatusScript, }; +function isGoogleMeetTalkBackMode(mode: GoogleMeetMode): boolean { + return mode === "agent" || mode === "bidi"; +} + export function outputMentionsBlackHole2ch(output: string): boolean { return /\bBlackHole\s+2ch\b/i.test(output); } @@ -86,7 +92,7 @@ export async function launchChromeMeet(params: { config: GoogleMeetConfig; fullConfig: OpenClawConfig; meetingSessionId: string; - mode: "realtime" | "transcribe"; + mode: GoogleMeetMode; url: string; logger: RuntimeLogger; }): Promise<{ @@ -97,7 +103,7 @@ export async function launchChromeMeet(params: { browser?: GoogleMeetChromeHealth; }> { const checkRealtimeAudioPrerequisites = async () => { - if (params.mode !== "realtime") { + if 
(!isGoogleMeetTalkBackMode(params.mode)) { return; } await assertBlackHole2chAvailable({ @@ -123,10 +129,15 @@ export async function launchChromeMeet(params: { | ({ type: "command-pair" } & ChromeRealtimeAudioBridgeHandle) | undefined > => { - if (params.mode !== "realtime") { + if (!isGoogleMeetTalkBackMode(params.mode)) { return undefined; } if (params.config.chrome.audioBridgeCommand) { + if (params.mode === "agent") { + throw new Error( + "Chrome agent mode requires chrome.audioInputCommand and chrome.audioOutputCommand so OpenClaw can run STT and regular TTS directly.", + ); + } const bridge = await params.runtime.system.runCommandWithTimeout( params.config.chrome.audioBridgeCommand, { timeoutMs: params.config.chrome.joinTimeoutMs }, @@ -140,20 +151,33 @@ export async function launchChromeMeet(params: { } if (!params.config.chrome.audioInputCommand || !params.config.chrome.audioOutputCommand) { throw new Error( - "Chrome realtime mode requires chrome.audioInputCommand and chrome.audioOutputCommand, or chrome.audioBridgeCommand for an external bridge.", + "Chrome talk-back mode requires chrome.audioInputCommand and chrome.audioOutputCommand, or chrome.audioBridgeCommand for an external bridge.", ); } return { type: "command-pair", - ...(await startCommandRealtimeAudioBridge({ - config: params.config, - fullConfig: params.fullConfig, - runtime: params.runtime, - meetingSessionId: params.meetingSessionId, - inputCommand: params.config.chrome.audioInputCommand, - outputCommand: params.config.chrome.audioOutputCommand, - logger: params.logger, - })), + ...(params.mode === "agent" + ? 
await startCommandAgentAudioBridge({ + config: params.config, + fullConfig: params.fullConfig, + runtime: params.runtime, + meetingSessionId: params.meetingSessionId, + inputCommand: params.config.chrome.audioInputCommand, + outputCommand: params.config.chrome.audioOutputCommand, + logger: params.logger, + }) + : await startCommandRealtimeAudioBridge({ + config: { + ...params.config, + realtime: { ...params.config.realtime, strategy: "bidi" }, + }, + fullConfig: params.fullConfig, + runtime: params.runtime, + meetingSessionId: params.meetingSessionId, + inputCommand: params.config.chrome.audioInputCommand, + outputCommand: params.config.chrome.audioOutputCommand, + logger: params.logger, + })), }; }; @@ -170,7 +194,7 @@ export async function launchChromeMeet(params: { url: params.url, }); const shouldStartRealtimeBridge = - params.mode === "realtime" && + isGoogleMeetTalkBackMode(params.mode) && result.browser?.inCall === true && result.browser.micMuted !== true && result.browser.manualActionRequired !== true; @@ -387,7 +411,7 @@ function meetStatusScript(params: { } if (!readOnly && allowMicrophone && mic && /turn on microphone/i.test(buttonLabel(mic))) { mic.click(); - notes.push("Attempted to turn on the Meet microphone for realtime mode."); + notes.push("Attempted to turn on the Meet microphone for talk-back mode."); } if (!readOnly && !allowMicrophone && mic && /turn off microphone/i.test(mic.getAttribute('aria-label') || text(mic))) { mic.click(); @@ -595,7 +619,7 @@ async function openMeetWithBrowserProxy(params: { runtime: PluginRuntime; nodeId: string; config: GoogleMeetConfig; - mode: "realtime" | "transcribe"; + mode: GoogleMeetMode; url: string; }): Promise<{ launched: boolean; browser?: GoogleMeetChromeHealth }> { return await openMeetWithBrowserRequest({ @@ -617,7 +641,7 @@ async function openMeetWithBrowserProxy(params: { async function openMeetWithBrowserRequest(params: { callBrowser: BrowserRequestCaller; config: GoogleMeetConfig; - mode: 
"realtime" | "transcribe"; + mode: GoogleMeetMode; url: string; }): Promise<{ launched: boolean; browser?: GoogleMeetChromeHealth }> { if (!params.config.chrome.launch) { @@ -670,7 +694,7 @@ async function openMeetWithBrowserRequest(params: { } const permissionNotes = await grantMeetMediaPermissions({ - allowMicrophone: params.mode === "realtime", + allowMicrophone: isGoogleMeetTalkBackMode(params.mode), callBrowser: params.callBrowser, targetId, timeoutMs, @@ -691,7 +715,7 @@ async function openMeetWithBrowserRequest(params: { kind: "evaluate", targetId, fn: meetStatusScript({ - allowMicrophone: params.mode === "realtime", + allowMicrophone: isGoogleMeetTalkBackMode(params.mode), captureCaptions: params.mode === "transcribe", guestName: params.config.chrome.guestName, autoJoin: params.config.chrome.autoJoin, @@ -700,7 +724,10 @@ async function openMeetWithBrowserRequest(params: { timeoutMs: Math.min(timeoutMs, 10_000), }); browser = mergeBrowserNotes(parseMeetBrowserStatus(evaluated) ?? 
browser, permissionNotes); - if (browser?.inCall === true && (params.mode !== "realtime" || browser.micMuted !== true)) { + if ( + browser?.inCall === true && + (!isGoogleMeetTalkBackMode(params.mode) || browser.micMuted !== true) + ) { return { launched: true, browser }; } if (browser?.manualActionRequired === true) { @@ -747,7 +774,7 @@ function isRecoverableMeetTab(tab: BrowserTab, url?: string): boolean { async function inspectRecoverableMeetTab(params: { callBrowser: BrowserRequestCaller; config: GoogleMeetConfig; - mode?: "realtime" | "transcribe"; + mode?: GoogleMeetMode; readOnly?: boolean; timeoutMs: number; tab: BrowserTab; @@ -807,7 +834,7 @@ async function inspectRecoverableMeetTab(params: { export async function recoverCurrentMeetTab(params: { config: GoogleMeetConfig; - mode?: "realtime" | "transcribe"; + mode?: GoogleMeetMode; readOnly?: boolean; url?: string; }): Promise<{ @@ -856,7 +883,7 @@ export async function recoverCurrentMeetTab(params: { export async function recoverCurrentMeetTabOnNode(params: { runtime: PluginRuntime; config: GoogleMeetConfig; - mode?: "realtime" | "transcribe"; + mode?: GoogleMeetMode; readOnly?: boolean; url?: string; }): Promise<{ @@ -923,7 +950,7 @@ export async function launchChromeMeetOnNode(params: { config: GoogleMeetConfig; fullConfig: OpenClawConfig; meetingSessionId: string; - mode: "realtime" | "transcribe"; + mode: GoogleMeetMode; url: string; logger: RuntimeLogger; }): Promise<{ @@ -985,8 +1012,16 @@ export async function launchChromeMeetOnNode(params: { if (!result.bridgeId) { throw new Error("Google Meet node did not return an audio bridge id."); } - const bridge = await startNodeRealtimeAudioBridge({ - config: params.config, + const bridge = await ( + params.mode === "agent" ? startNodeAgentAudioBridge : startNodeRealtimeAudioBridge + )({ + config: + params.mode === "agent" + ? 
params.config + : { + ...params.config, + realtime: { ...params.config.realtime, strategy: "bidi" }, + }, fullConfig: params.fullConfig, runtime: params.runtime, meetingSessionId: params.meetingSessionId,