From 9ddfe52ff90f93cc63013e578a1ad55fff888c62 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 07:52:11 +0100 Subject: [PATCH] fix: prove Google Meet listen health (#74824) --- CHANGELOG.md | 1 + docs/plugins/google-meet.md | 27 +++- extensions/google-meet/index.test.ts | 70 +++++++++++ extensions/google-meet/index.ts | 32 ++++- extensions/google-meet/src/cli.test.ts | 49 ++++++++ extensions/google-meet/src/cli.ts | 28 +++++ extensions/google-meet/src/runtime.ts | 119 ++++++++++++++++++ .../google-meet/src/transports/types.ts | 1 + 8 files changed, 324 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dff17f821da..2efb2b4a62e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai - Plugins/beta: prepare Brave, Codex, Feishu, Synology Chat, Tlon, and Twitch for `2026.5.1-beta.1` npm and ClawHub publishing. Thanks @vincentkoc. - Providers/xAI: add Grok 4.3 to the bundled catalog and make it the default xAI chat model. - Google Meet: let API-created rooms set `accessType` and `entryPointAccess`, and add `googlemeet end-active-conference` for closing managed spaces after a call. (#74824) Thanks @BsnizND. +- Google Meet: add `googlemeet test-listen` and the matching `google_meet` `test_listen` action so transcribe-mode joins wait for real caption or transcript movement before reporting listen-first health. Refs #72478. Thanks @DougButdorf. - Plugins/ClawHub: prefer versioned ClawPack artifacts when ClawHub publishes digest metadata, verifying the ClawPack response header and downloaded bytes before installing. Thanks @vincentkoc. - Plugins/ClawHub: persist ClawPack digest metadata on ClawHub plugin install and update records so registry refreshes and download verification can reuse stored artifact facts. Thanks @vincentkoc. - Plugins/ClawHub: allow official bundled-plugin cutovers to prefer ClawHub installs with npm fallback only when the ClawHub package or version is absent. Thanks @vincentkoc. diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index f68cb3851cc..4846b371b49 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -193,6 +193,10 @@ a best-effort Meet caption observer. `googlemeet status --json` and `transcriptLines`, `lastCaptionAt`, `lastCaptionSpeaker`, `lastCaptionText`, and a short `recentTranscript` tail so operators can tell whether the browser joined the call and whether Meet captions are producing text. +Use `openclaw googlemeet test-listen --transport chrome-node` when +you need a yes/no probe: it joins in transcribe mode, waits for fresh caption or +transcript movement, and returns `listenVerified`, `listenTimedOut`, manual +action fields, and the latest caption health. During realtime sessions, `google_meet` status includes browser and audio bridge health such as `inCall`, `manualActionRequired`, `providerConnected`, @@ -820,6 +824,18 @@ And they can end the active conference for a known room: } ``` +For listen-first validation, agents should use `test_listen` before claiming the +meeting is useful: + +```json +{ + "action": "test_listen", + "url": "https://meet.google.com/abc-defg-hij", + "transport": "chrome-node", + "timeoutMs": 30000 +} +``` + Run the guarded live smoke against a real retained meeting: ```bash @@ -828,6 +844,14 @@ OPENCLAW_GOOGLE_MEET_LIVE_MEETING=https://meet.google.com/abc-defg-hij \ pnpm test:live -- extensions/google-meet/google-meet.live.test.ts ``` +Run the live listen-first browser probe against a meeting where someone will +speak with Meet captions available: + +```bash +openclaw googlemeet setup --transport chrome-node --mode transcribe +openclaw googlemeet test-listen https://meet.google.com/abc-defg-hij --transport chrome-node --timeout-ms 30000 +``` + Live smoke environment: - `OPENCLAW_LIVE_TEST=1` enables guarded live tests. @@ -1297,7 +1321,8 @@ openclaw nodes status --connected ### Browser opens but agent cannot join -Run `googlemeet test-speech` and inspect the returned Chrome health. If it +Run `googlemeet test-listen` for observe-only joins or `googlemeet test-speech` +for realtime joins, then inspect the returned Chrome health. If either probe reports `manualActionRequired: true`, show `manualActionMessage` to the operator and stop retrying until the browser action is complete. diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index f4ace2358bf..96a0c630a37 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -560,6 +560,7 @@ describe("google-meet plugin", () => { "end_active_conference", "speak", "test_speech", + "test_listen", ], description: expect.stringContaining("recover_current_tab"), }, @@ -2395,6 +2396,52 @@ describe("google-meet plugin", () => { expect(result.details).toMatchObject({ createdSession: true }); }); + it("exposes a test-listen action that proves transcript movement", async () => { + const { tools, nodesInvoke } = setup( + { + defaultTransport: "chrome-node", + }, + { + browserActResult: { + inCall: true, + captioning: true, + transcriptLines: 1, + lastCaptionText: "hello from the meeting", + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + }, + nodesInvokeResult: { + payload: { + launched: true, + }, + }, + }, + ); + const tool = tools[0] as { + execute: ( + id: string, + params: unknown, + ) => Promise<{ details: { listenVerified?: boolean; transcriptLines?: number } }>; + }; + + const result = await tool.execute("id", { + action: "test_listen", + url: "https://meet.google.com/abc-defg-hij", + timeoutMs: 100, + }); + + expect(nodesInvoke).toHaveBeenCalledWith( + expect.objectContaining({ + command: "googlemeet.chrome", + params: expect.objectContaining({ + action: "start", + mode: "transcribe", + }), + }), + ); + expect(result.details).toMatchObject({ listenVerified: true, transcriptLines: 1 }); + }); + it("does not start a second realtime response for test speech", async () => { const runtime = new GoogleMeetRuntime({ config: resolveGoogleMeetConfig({}), @@ -2456,6 +2503,29 @@ describe("google-meet plugin", () => { ).rejects.toThrow("test_speech requires mode: realtime"); }); + it("rejects realtime and Twilio modes for test listen", async () => { + const runtime = new GoogleMeetRuntime({ + config: resolveGoogleMeetConfig({}), + fullConfig: {} as never, + runtime: {} as never, + logger: noopLogger, + }); + + await expect( + runtime.testListen({ + url: "https://meet.google.com/abc-defg-hij", + mode: "realtime", + }), + ).rejects.toThrow("test_listen requires mode: transcribe"); + + await expect( + runtime.testListen({ + url: "https://meet.google.com/abc-defg-hij", + transport: "twilio", + }), + ).rejects.toThrow("test_listen supports chrome or chrome-node"); + }); + it("reports manual action when the browser profile needs Google login", async () => { const { tools } = setup( { diff --git a/extensions/google-meet/index.ts b/extensions/google-meet/index.ts index a45a973f86e..b2662622328 100644 --- a/extensions/google-meet/index.ts +++ b/extensions/google-meet/index.ts @@ -205,6 +205,7 @@ const GoogleMeetToolSchema = Type.Object({ "end_active_conference", "speak", "test_speech", + "test_listen", ], description: "Google Meet action to run. create creates and joins by default; pass join=false to only mint a URL. After a timeout or unclear browser state, call recover_current_tab before retrying join.", @@ -243,6 +244,7 @@ const GoogleMeetToolSchema = Type.Object({ dtmfSequence: Type.Optional(Type.String({ description: "Explicit DTMF sequence for Twilio" })), sessionId: Type.Optional(Type.String({ description: "Meet session ID" })), message: Type.Optional(Type.String({ description: "Realtime instructions to speak now" })), + timeoutMs: Type.Optional(Type.Number({ description: "Probe timeout in milliseconds" })), meeting: Type.Optional(Type.String({ description: "Meet URL, meeting code, or spaces/{id}" })), today: Type.Optional( Type.Boolean({ @@ -360,7 +362,8 @@ type GoogleMeetGatewayToolAction = | "leave" | "end_active_conference" | "speak" - | "test_speech"; + | "test_speech" + | "test_listen"; function googleMeetGatewayMethodForToolAction(action: GoogleMeetGatewayToolAction): string { switch (action) { @@ -370,6 +373,8 @@ function googleMeetGatewayMethodForToolAction(action: GoogleMeetGatewayToolActio return "googlemeet.setup"; case "test_speech": return "googlemeet.testSpeech"; + case "test_listen": + return "googlemeet.testListen"; case "end_active_conference": return "googlemeet.endActiveConference"; default: @@ -917,11 +922,29 @@ export default definePluginEntry({ }, ); + api.registerGatewayMethod( + "googlemeet.testListen", + async ({ params, respond }: GatewayRequestHandlerOptions) => { + try { + const rt = await ensureRuntime(); + const result = await rt.testListen({ + url: resolveMeetingInput(config, params?.url), + transport: normalizeTransport(params?.transport), + mode: normalizeMode(params?.mode), + timeoutMs: typeof params?.timeoutMs === "number" ? params.timeoutMs : undefined, + }); + respond(true, result); + } catch (err) { + sendError(respond, err); + } + }, + ); + api.registerTool({ name: "google_meet", label: "Google Meet", description: - "Join and track Google Meet sessions through Chrome or Twilio. Call setup_status before join/create/test_speech; if it reports a Chrome node offline or local audio missing, surface that blocker instead of retrying or switching transports. Offline nodes are diagnostics only, not usable candidates. If a Meet tab is already open after a timeout, call recover_current_tab before retrying join to report login, permission, or admission blockers without opening another tab.", + "Join and track Google Meet sessions through Chrome or Twilio. Call setup_status before join/create/test_listen/test_speech; if it reports a Chrome node offline or local audio missing, surface that blocker instead of retrying or switching transports. Offline nodes are diagnostics only, not usable candidates. If a Meet tab is already open after a timeout, call recover_current_tab before retrying join to report login, permission, or admission blockers without opening another tab.", parameters: GoogleMeetToolSchema, async execute(_toolCallId, params) { const raw = asParamRecord(params); @@ -938,6 +961,11 @@ export default definePluginEntry({ await callGoogleMeetGatewayFromTool({ config, action: "test_speech", raw }), ); } + case "test_listen": { + return json( + await callGoogleMeetGatewayFromTool({ config, action: "test_listen", raw }), + ); + } case "status": { return json(await callGoogleMeetGatewayFromTool({ config, action: "status", raw })); } diff --git a/extensions/google-meet/src/cli.test.ts b/extensions/google-meet/src/cli.test.ts index 6c3e441f51d..b55d3bd59f3 100644 --- a/extensions/google-meet/src/cli.test.ts +++ b/extensions/google-meet/src/cli.test.ts @@ -689,6 +689,55 @@ describe("google-meet CLI", () => { } }); + it("runs a listen-first health probe", async () => { + const testListen = vi.fn(async () => ({ + createdSession: true, + listenVerified: true, + listenTimedOut: false, + transcriptLines: 1, + session: { + id: "meet_1", + url: "https://meet.google.com/abc-defg-hij", + state: "active", + transport: "chrome-node", + mode: "transcribe", + participantIdentity: "signed-in Google Chrome profile on a paired node", + createdAt: "2026-04-25T00:00:00.000Z", + updatedAt: "2026-04-25T00:00:01.000Z", + realtime: { enabled: false, provider: "openai", toolPolicy: "safe-read-only" }, + notes: [], + }, + })); + const stdout = captureStdout(); + try { + await setupCli({ + runtime: { testListen }, + }).parseAsync( + [ + "googlemeet", + "test-listen", + "https://meet.google.com/abc-defg-hij", + "--transport", + "chrome-node", + "--timeout-ms", + "30000", + ], + { from: "user" }, + ); + expect(testListen).toHaveBeenCalledWith({ + url: "https://meet.google.com/abc-defg-hij", + transport: "chrome-node", + timeoutMs: 30000, + }); + expect(JSON.parse(stdout.output())).toMatchObject({ + listenVerified: true, + transcriptLines: 1, + }); + } finally { + stdout.restore(); + } + }); + it("prints a dry-run export manifest without writing files", async () => { stubMeetArtifactsApi(); const stdout = captureStdout(); diff --git a/extensions/google-meet/src/cli.ts b/extensions/google-meet/src/cli.ts index 7073712dae0..e2911ec6261 100644 --- a/extensions/google-meet/src/cli.ts +++ b/extensions/google-meet/src/cli.ts @@ -37,6 +37,7 @@ type JoinOptions = { transport?: GoogleMeetTransport; mode?: GoogleMeetMode; message?: string; + timeoutMs?: string; dialInNumber?: string; pin?: string; dtmfSequence?: string; @@ -228,6 +229,17 @@ function formatOptional(value: unknown): string { return typeof value === "string" && value.trim() ? value : "n/a"; } +function parsePositiveNumber(value: string | undefined, label: string): number | undefined { + if (value === undefined) { + return undefined; + } + const parsed = Number(value); + if (!Number.isFinite(parsed) || parsed <= 0) { + throw new Error(`${label} must be a positive number`); + } + return parsed; +} + function formatDuration(value: number | undefined): string { if (value === undefined) { return "n/a"; @@ -1567,6 +1579,22 @@ export function registerGoogleMeetCli(params: { ); }); + root + .command("test-listen") + .argument("[url]", "Explicit https://meet.google.com/... URL") + .option("--transport ", "Transport: chrome or chrome-node") + .option("--timeout-ms ", "How long to wait for fresh captions/transcript movement") + .action(async (url: string | undefined, options: JoinOptions) => { + const rt = await params.ensureRuntime(); + writeStdoutJson( + await rt.testListen({ + url: resolveMeetingInput(params.config, url), + transport: options.transport, + timeoutMs: parsePositiveNumber(options.timeoutMs, "timeout-ms"), + }), + ); + }); + root .command("resolve-space") .description("Resolve a Meet URL, meeting code, or spaces/{id} to its canonical space") diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index 2ec1a935df3..5367cef4c4a 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -66,6 +66,43 @@ function hasRealtimeAudioOutputAdvanced( return (health?.lastOutputBytes ?? 0) > startOutputBytes; } +type TranscriptCheckpoint = { + lines: number; + lastCaptionAt?: string; + lastCaptionText?: string; +}; + +function transcriptCheckpoint(health: GoogleMeetChromeHealth | undefined): TranscriptCheckpoint { + return { + lines: health?.transcriptLines ?? 0, + lastCaptionAt: health?.lastCaptionAt, + lastCaptionText: health?.lastCaptionText, + }; +} + +function hasTranscriptAdvanced( + health: GoogleMeetChromeHealth | undefined, + start: TranscriptCheckpoint, +): boolean { + if ((health?.transcriptLines ?? 0) > start.lines) { + return true; + } + if (health?.lastCaptionAt && health.lastCaptionAt !== start.lastCaptionAt) { + return true; + } + return Boolean(health?.lastCaptionText && health.lastCaptionText !== start.lastCaptionText); +} + +function resolveProbeTimeoutMs(input: number | undefined, fallback: number): number { + if (input === undefined) { + return Math.min(Math.max(fallback, 1), 120_000); + } + if (!Number.isFinite(input) || input <= 0) { + throw new Error("timeoutMs must be a positive number"); + } + return Math.min(Math.trunc(input), 120_000); +} + function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)); } @@ -597,6 +634,88 @@ export class GoogleMeetRuntime { }; } + async testListen(request: GoogleMeetJoinRequest): Promise<{ + createdSession: boolean; + inCall?: boolean; + manualActionRequired?: boolean; + manualActionReason?: GoogleMeetChromeHealth["manualActionReason"]; + manualActionMessage?: string; + listenVerified: boolean; + listenTimedOut: boolean; + captioning?: boolean; + captionsEnabledAttempted?: boolean; + transcriptLines?: number; + lastCaptionAt?: string; + lastCaptionSpeaker?: string; + lastCaptionText?: string; + recentTranscript?: GoogleMeetChromeHealth["recentTranscript"]; + session: GoogleMeetSession; + }> { + if (request.mode === "realtime") { + throw new Error( + "test_listen requires mode: transcribe; use test_speech for realtime talk-back.", + ); + } + const url = normalizeMeetUrl(request.url); + const transport = resolveTransport(request.transport, this.params.config); + if (transport === "twilio") { + throw new Error("test_listen supports chrome or chrome-node transports"); + } + const beforeSessions = this.list(); + const before = new Set(beforeSessions.map((session) => session.id)); + const existingSession = beforeSessions.find( + (session) => + session.state === "active" && + isSameMeetUrlForReuse(session.url, url) && + session.transport === transport && + session.mode === "transcribe", + ); + const start = transcriptCheckpoint(existingSession?.chrome?.health); + const result = await this.join({ + ...request, + transport, + url, + mode: "transcribe", + message: undefined, + }); + let health = result.session.chrome?.health; + const timeoutMs = resolveProbeTimeoutMs( + request.timeoutMs, + this.params.config.chrome.joinTimeoutMs, + ); + const shouldWait = + health?.manualActionRequired !== true && isManagedChromeBrowserSession(result.session); + if (shouldWait && !hasTranscriptAdvanced(health, start)) { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + await sleep(250); + await this.#refreshCaptionHealthForSession(result.session); + health = result.session.chrome?.health; + if (health?.manualActionRequired || hasTranscriptAdvanced(health, start)) { + break; + } + } + } + const listenVerified = hasTranscriptAdvanced(health, start); + return { + createdSession: !before.has(result.session.id), + inCall: health?.inCall, + manualActionRequired: health?.manualActionRequired, + manualActionReason: health?.manualActionReason, + manualActionMessage: health?.manualActionMessage, + listenVerified, + listenTimedOut: shouldWait && !listenVerified && health?.manualActionRequired !== true, + captioning: health?.captioning, + captionsEnabledAttempted: health?.captionsEnabledAttempted, + transcriptLines: health?.transcriptLines, + lastCaptionAt: health?.lastCaptionAt, + lastCaptionSpeaker: health?.lastCaptionSpeaker, + lastCaptionText: health?.lastCaptionText, + recentTranscript: health?.recentTranscript, + session: result.session, + }; + } + async #refreshCaptionHealthForSession(session: GoogleMeetSession) { if (session.mode !== "transcribe") { this.#refreshSpeechReadiness(session); diff --git a/extensions/google-meet/src/transports/types.ts b/extensions/google-meet/src/transports/types.ts index 3624dfd287b..1dbc9e04808 100644 --- a/extensions/google-meet/src/transports/types.ts +++ b/extensions/google-meet/src/transports/types.ts @@ -7,6 +7,7 @@ export type GoogleMeetJoinRequest = { transport?: GoogleMeetTransport; mode?: GoogleMeetMode; message?: string; + timeoutMs?: number; dialInNumber?: string; pin?: string; dtmfSequence?: string;