diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ad081f8775..451513098ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Docs: https://docs.openclaw.ai - Dependencies: refresh bundled runtime and plugin dependency pins, including Pi 0.71.1, OpenAI 6.35.0, Codex 0.128.0, Zod 4.4.1, and Matrix 41.4.0. Thanks @mariozechner. - Agents/workspace: add `agents.defaults.skipOptionalBootstrapFiles` for skipping selected optional workspace files during bootstrap without disabling required workspace setup. (#62110) Thanks @mainstay22. - Plugins/CLI: add first-class `git:` plugin installs with ref checkout, commit metadata, normal scanner/staging, and `plugins update` support for recorded git sources. Thanks @badlogic. +- Google Meet: add live caption health for Chrome transcribe mode, including caption observer state, transcript counters, last caption text, and recent transcript lines in status and doctor output. Refs #72478. Thanks @DougButdorf. - Voice Call/Google Meet: add Twilio Meet join phase logs around pre-connect DTMF, realtime stream setup, and initial greeting handoff for easier live-call debugging. Thanks @donkeykong91 and @PfanP. - macOS app: move recent session context rows into a Context submenu while keeping usage and cost details root-level, so the menu bar companion stays compact with many active sessions. Thanks @guti. - Gateway/SDK: add SDK-facing tools.invoke RPC with shared HTTP policy, typed approval/refusal results, and SDK helper support. Refs #74705. Thanks @BunsDev and @ai-hpc. diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index d79084d92c0..45849e3c7bf 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -169,7 +169,12 @@ and will not talk back into the meeting. Chrome joins in this mode also avoid OpenClaw's microphone/camera permission grant and avoid the Meet **Use microphone** path. 
If Meet shows an audio-choice interstitial, automation tries the no-microphone path and otherwise reports a manual action instead of opening -the local microphone. +the local microphone. In transcribe mode, managed Chrome transports also install +a best-effort Meet caption observer. `googlemeet status --json` and +`googlemeet doctor` surface `captioning`, `captionsEnabledAttempted`, +`transcriptLines`, `lastCaptionAt`, `lastCaptionSpeaker`, `lastCaptionText`, +and a short `recentTranscript` tail so operators can tell whether the browser +joined the call and whether Meet captions are producing text. During realtime sessions, `google_meet` status includes browser and audio bridge health such as `inCall`, `manualActionRequired`, `providerConnected`, @@ -1294,9 +1299,15 @@ openclaw googlemeet doctor ``` Use `mode: "realtime"` for listen/talk-back. `mode: "transcribe"` intentionally -does not start the duplex realtime voice bridge. `googlemeet test-speech` -always checks the realtime path and reports whether bridge output bytes were -observed for that invocation. If `speechOutputVerified` is false and +does not start the duplex realtime voice bridge. For observe-only debugging, +run `openclaw googlemeet status --json <sessionId>` after participants speak +and check `captioning`, `transcriptLines`, and `lastCaptionText`. If `inCall` is +true but `transcriptLines` stays at `0`, Meet captions may be disabled, no one +has spoken since the observer was installed, the Meet UI changed, or live +captions are unavailable for the meeting language/account. + +`googlemeet test-speech` always checks the realtime path and reports whether +bridge output bytes were observed for that invocation. If `speechOutputVerified` is false and `speechOutputTimedOut` is true, the realtime provider may have accepted the utterance but OpenClaw did not see new output bytes reach the Chrome audio bridge. 
diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index ce71e091be0..0afe9781ab2 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -3,6 +3,7 @@ import { existsSync, mkdtempSync, readFileSync, rmSync } from "node:fs"; import { tmpdir } from "node:os"; import path from "node:path"; import { PassThrough, Writable } from "node:stream"; +import { createContext, Script } from "node:vm"; import type { RealtimeVoiceProviderPlugin } from "openclaw/plugin-sdk/realtime-voice"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import plugin, { __testing as googleMeetPluginTesting } from "./index.js"; @@ -1635,7 +1636,25 @@ describe("google-meet plugin", () => { const { methods, runCommandWithTimeout } = setup({ defaultMode: "transcribe", }); - const callGatewayFromCli = mockLocalMeetBrowserRequest(); + const callGatewayFromCli = mockLocalMeetBrowserRequest({ + inCall: true, + micMuted: true, + captioning: true, + captionsEnabledAttempted: true, + transcriptLines: 1, + lastCaptionAt: "2026-04-27T10:00:00.000Z", + lastCaptionSpeaker: "Alice", + lastCaptionText: "Can everyone hear the agent?", + recentTranscript: [ + { + at: "2026-04-27T10:00:00.000Z", + speaker: "Alice", + text: "Can everyone hear the agent?", + }, + ], + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + }); const handler = methods.get("googlemeet.join") as | ((ctx: { params: Record; @@ -1666,17 +1685,292 @@ describe("google-meet plugin", () => { ([, , request]) => (request as { path?: string }).path === "/permissions/grant", ), ).toBe(false); + expect(respond.mock.calls[0]?.[1]).toMatchObject({ + session: { + chrome: { + health: { + captioning: true, + captionsEnabledAttempted: true, + transcriptLines: 1, + lastCaptionSpeaker: "Alice", + lastCaptionText: "Can everyone hear the agent?", + recentTranscript: [ + { + speaker: "Alice", + text: "Can everyone hear the agent?", + }, + ], + }, 
+ }, + }, + }); const actCall = callGatewayFromCli.mock.calls.find( ([, , request]) => (request as { path?: string }).path === "/act", ); expect(String((actCall?.[2] as { body?: { fn?: string } } | undefined)?.body?.fn)).toContain( "const allowMicrophone = false", ); + expect(String((actCall?.[2] as { body?: { fn?: string } } | undefined)?.body?.fn)).toContain( + "const captureCaptions = true", + ); } finally { Object.defineProperty(process, "platform", { value: originalPlatform }); } }); + it("refreshes observe-only caption health when status is requested", async () => { + let openedTab = false; + let actCount = 0; + const callGatewayFromCli = vi.fn( + async ( + _method: string, + _opts: unknown, + params?: unknown, + _extra?: unknown, + ): Promise> => { + const request = params as { + path?: string; + body?: { targetId?: string; url?: string }; + }; + if (request.path === "/tabs") { + return openedTab + ? { + tabs: [ + { + targetId: "local-meet-tab", + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }, + ], + } + : { tabs: [] }; + } + if (request.path === "/tabs/open") { + openedTab = true; + return { + targetId: "local-meet-tab", + title: "Meet", + url: request.body?.url ?? "https://meet.google.com/abc-defg-hij", + }; + } + if (request.path === "/tabs/focus") { + return { ok: true }; + } + if (request.path === "/act") { + actCount += 1; + return { + result: JSON.stringify( + actCount === 1 + ? 
{ + inCall: true, + captioning: false, + captionsEnabledAttempted: true, + transcriptLines: 0, + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + } + : { + inCall: true, + captioning: true, + captionsEnabledAttempted: true, + transcriptLines: 1, + lastCaptionAt: "2026-04-27T10:00:00.000Z", + lastCaptionSpeaker: "Alice", + lastCaptionText: "Please capture this.", + recentTranscript: [ + { + at: "2026-04-27T10:00:00.000Z", + speaker: "Alice", + text: "Please capture this.", + }, + ], + title: "Meet call", + url: "https://meet.google.com/abc-defg-hij", + }, + ), + }; + } + throw new Error(`unexpected browser request path ${request.path}`); + }, + ); + chromeTransportTesting.setDepsForTest({ callGatewayFromCli }); + const { methods } = setup({ + defaultMode: "transcribe", + defaultTransport: "chrome", + }); + + const join = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.join", { + url: "https://meet.google.com/abc-defg-hij", + })) as { session: { id: string; chrome?: { health?: { transcriptLines?: number } } } }; + expect(join.session.chrome?.health?.transcriptLines).toBe(0); + + const status = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.status", { + sessionId: join.session.id, + })) as { + session?: { + chrome?: { + health?: { + captioning?: boolean; + transcriptLines?: number; + lastCaptionText?: string; + }; + }; + }; + }; + + expect(status.session?.chrome?.health).toMatchObject({ + captioning: true, + transcriptLines: 1, + lastCaptionText: "Please capture this.", + }); + expect(callGatewayFromCli).toHaveBeenCalledWith( + "browser.request", + expect.any(Object), + expect.objectContaining({ + method: "POST", + path: "/tabs/focus", + body: { targetId: "local-meet-tab" }, + }), + { progress: false }, + ); + }); + + it("does not mutate realtime browser prompts when status is requested", async () => { + let openedTab = false; + const { methods, nodesInvoke } = setup( + { + defaultMode: "realtime", + 
defaultTransport: "chrome-node", + }, + { + nodesInvokeHandler: async ({ command, params }) => { + const raw = params as { path?: string; body?: { url?: string; targetId?: string } }; + if (command === "browser.proxy") { + if (raw.path === "/tabs") { + return { payload: { result: { running: true, tabs: [] } } }; + } + if (raw.path === "/tabs/open") { + openedTab = true; + return { + payload: { + result: { + targetId: "tab-1", + title: "Meet", + url: raw.body?.url ?? "https://meet.google.com/abc-defg-hij", + }, + }, + }; + } + if (raw.path === "/tabs/focus" || raw.path === "/permissions/grant") { + return { payload: { result: { ok: true } } }; + } + if (raw.path === "/act") { + return { + payload: { + result: { + ok: true, + targetId: raw.body?.targetId ?? "tab-1", + result: JSON.stringify({ + inCall: false, + manualActionRequired: true, + manualActionReason: "meet-audio-choice-required", + manualActionMessage: "Choose the Meet microphone path manually.", + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }), + }, + }, + }; + } + } + if (command === "googlemeet.chrome") { + return { payload: { launched: openedTab } }; + } + throw new Error(`unexpected invoke ${command}`); + }, + }, + ); + + const join = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.join", { + url: "https://meet.google.com/abc-defg-hij", + })) as { session: { id: string } }; + nodesInvoke.mockClear(); + + const status = (await invokeGoogleMeetGatewayMethodForTest(methods, "googlemeet.status", { + sessionId: join.session.id, + })) as { session?: { chrome?: { health?: { manualActionRequired?: boolean } } } }; + + expect(status.session?.chrome?.health?.manualActionRequired).toBe(true); + expect(nodesInvoke).not.toHaveBeenCalledWith( + expect.objectContaining({ command: "browser.proxy" }), + ); + }); + + it("retries caption enable until the captions button is available", () => { + const makeButton = (label: string) => ({ + disabled: false, + innerText: "", + textContent: 
"", + click: vi.fn(), + getAttribute: vi.fn((name: string) => (name === "aria-label" ? label : null)), + }); + const leaveButton = makeButton("Leave call"); + const captionButton = makeButton("Turn on captions"); + const page = { + buttons: [leaveButton], + }; + const windowState: Record = {}; + const document = { + body: { innerText: "", textContent: "" }, + title: "Meet", + querySelector: vi.fn(() => null), + querySelectorAll: vi.fn((selector: string) => { + if (selector === "button") { + return page.buttons; + } + if (selector === "input") { + return []; + } + return []; + }), + }; + const context = createContext({ + Date, + JSON, + String, + document, + location: { + href: "https://meet.google.com/abc-defg-hij", + hostname: "meet.google.com", + }, + MutationObserver: class { + observe = vi.fn(); + }, + window: windowState, + }); + const inspect = new Script( + `(${chromeTransportTesting.meetStatusScriptForTest({ + allowMicrophone: false, + autoJoin: false, + captureCaptions: true, + guestName: "OpenClaw Agent", + })})`, + ).runInContext(context) as () => string; + + const first = JSON.parse(inspect()) as { captionsEnabledAttempted?: boolean }; + const stateAfterFirst = windowState.__openclawMeetCaptions as { enabledAttempted?: boolean }; + expect(first.captionsEnabledAttempted).toBe(false); + expect(stateAfterFirst.enabledAttempted).toBe(false); + expect(captionButton.click).not.toHaveBeenCalled(); + + page.buttons = [leaveButton, captionButton]; + const second = JSON.parse(inspect()) as { captionsEnabledAttempted?: boolean }; + const stateAfterSecond = windowState.__openclawMeetCaptions as { enabledAttempted?: boolean }; + expect(second.captionsEnabledAttempted).toBe(true); + expect(stateAfterSecond.enabledAttempted).toBe(true); + expect(captionButton.click).toHaveBeenCalledTimes(1); + }); + it("joins Chrome on a paired node without local Chrome or BlackHole", async () => { const { methods, nodesList, nodesInvoke } = setup( { diff --git 
a/extensions/google-meet/index.ts b/extensions/google-meet/index.ts index 29a61185c74..866ed0fc76b 100644 --- a/extensions/google-meet/index.ts +++ b/extensions/google-meet/index.ts @@ -676,7 +676,7 @@ export default definePluginEntry({ async ({ params, respond }: GatewayRequestHandlerOptions) => { try { const rt = await ensureRuntime(); - respond(true, rt.status(normalizeOptionalString(params?.sessionId))); + respond(true, await rt.status(normalizeOptionalString(params?.sessionId))); } catch (err) { sendError(respond, err); } diff --git a/extensions/google-meet/src/cli.test.ts b/extensions/google-meet/src/cli.test.ts index 98f1c4bee01..75d4ce2abb4 100644 --- a/extensions/google-meet/src/cli.test.ts +++ b/extensions/google-meet/src/cli.test.ts @@ -603,7 +603,7 @@ describe("google-meet CLI", () => { try { await setupCli({ runtime: { - status: () => ({ + status: async () => ({ found: true, sessions: [ { @@ -684,7 +684,7 @@ describe("google-meet CLI", () => { try { await setupCli({ runtime: { - status: () => ({ + status: async () => ({ found: true, session: { id: "meet_1", @@ -703,6 +703,11 @@ describe("google-meet CLI", () => { audioBridge: { type: "node-command-pair", provider: "openai" }, health: { inCall: true, + captioning: true, + transcriptLines: 2, + lastCaptionAt: "2026-04-25T00:00:03.000Z", + lastCaptionSpeaker: "Alice", + lastCaptionText: "Can everyone hear OpenClaw?", providerConnected: true, realtimeReady: true, audioInputActive: true, @@ -720,6 +725,9 @@ describe("google-meet CLI", () => { expect(stdout.output()).toContain("session: meet_1"); expect(stdout.output()).toContain("node: node-1"); expect(stdout.output()).toContain("provider connected: yes"); + expect(stdout.output()).toContain("captioning: yes"); + expect(stdout.output()).toContain("transcript lines: 2"); + expect(stdout.output()).toContain("last caption text: Alice: Can everyone hear OpenClaw?"); expect(stdout.output()).toContain("audio input active: yes"); 
expect(stdout.output()).toContain("audio output active: no"); } finally { @@ -732,7 +740,7 @@ describe("google-meet CLI", () => { try { await setupCli({ runtime: { - status: () => ({ + status: async () => ({ found: true, session: { id: "meet_1", diff --git a/extensions/google-meet/src/cli.ts b/extensions/google-meet/src/cli.ts index aef8dfa3a9d..0f317f022d9 100644 --- a/extensions/google-meet/src/cli.ts +++ b/extensions/google-meet/src/cli.ts @@ -237,7 +237,7 @@ function formatDuration(value: number | undefined): string { : `${minutes}m ${seconds.toString().padStart(2, "0")}s`; } -function writeDoctorStatus(status: ReturnType): void { +function writeDoctorStatus(status: Awaited>): void { if (!status.found) { writeStdoutLine("Google Meet session: not found"); return; @@ -272,6 +272,10 @@ function writeDoctorStatus(status: ReturnType): voi session.chrome?.audioBridge?.provider ?? session.realtime.provider ?? "n/a", ); writeStdoutLine("in call: %s", formatBoolean(health?.inCall)); + writeStdoutLine("lobby waiting: %s", formatBoolean(health?.lobbyWaiting)); + writeStdoutLine("captioning: %s", formatBoolean(health?.captioning)); + writeStdoutLine("transcript lines: %s", health?.transcriptLines ?? 0); + writeStdoutLine("last caption: %s", formatOptional(health?.lastCaptionAt)); writeStdoutLine("manual action: %s", formatBoolean(health?.manualActionRequired)); if (health?.manualActionRequired) { writeStdoutLine("manual reason: %s", formatOptional(health.manualActionReason)); @@ -298,6 +302,10 @@ function writeDoctorStatus(status: ReturnType): voi ); writeStdoutLine("bridge closed: %s", formatBoolean(health?.bridgeClosed)); writeStdoutLine("browser url: %s", formatOptional(health?.browserUrl)); + if (health?.lastCaptionText) { + const speaker = health.lastCaptionSpeaker ? 
`${health.lastCaptionSpeaker}: ` : ""; + writeStdoutLine("last caption text: %s%s", speaker, health.lastCaptionText); + } } } @@ -1947,7 +1955,7 @@ export function registerGoogleMeetCli(params: { .option("--json", "Print JSON output", false) .action(async (sessionId?: string) => { const rt = await params.ensureRuntime(); - writeStdoutJson(rt.status(sessionId)); + writeStdoutJson(await rt.status(sessionId)); }); root @@ -1974,7 +1982,7 @@ export function registerGoogleMeetCli(params: { return; } const rt = await params.ensureRuntime(); - const status = rt.status(sessionId); + const status = await rt.status(sessionId); if (options.json) { writeStdoutJson(status); return; diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index 1ee625ff665..2ec1a935df3 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -169,16 +169,23 @@ export class GoogleMeetRuntime { return [...this.#sessions.values()].toSorted((a, b) => a.createdAt.localeCompare(b.createdAt)); } - status(sessionId?: string): { + async status(sessionId?: string): Promise<{ found: boolean; session?: GoogleMeetSession; sessions?: GoogleMeetSession[]; - } { + }> { this.#refreshHealth(sessionId); if (!sessionId) { - return { found: true, sessions: this.list() }; + const sessions = [...this.#sessions.values()].toSorted((a, b) => + a.createdAt.localeCompare(b.createdAt), + ); + await Promise.all(sessions.map((session) => this.#refreshCaptionHealthForSession(session))); + return { found: true, sessions }; } const session = this.#sessions.get(sessionId); + if (session) { + await this.#refreshCaptionHealthForSession(session); + } return session ? 
{ found: true, session } : { found: false }; } @@ -590,8 +597,20 @@ export class GoogleMeetRuntime { }; } + async #refreshCaptionHealthForSession(session: GoogleMeetSession) { + if (session.mode !== "transcribe") { + this.#refreshSpeechReadiness(session); + return; + } + await this.#refreshBrowserHealthForChromeSession(session); + } + async #refreshBrowserHealthForChromeSession(session: GoogleMeetSession) { - if (!isManagedChromeBrowserSession(session) || evaluateSpeechReadiness(session).ready) { + if (!isManagedChromeBrowserSession(session)) { + this.#refreshSpeechReadiness(session); + return; + } + if (session.mode === "realtime" && evaluateSpeechReadiness(session).ready) { this.#refreshSpeechReadiness(session); return; } @@ -601,10 +620,12 @@ export class GoogleMeetRuntime { ? await recoverCurrentMeetTabOnNode({ runtime: this.params.runtime, config: this.params.config, + mode: session.mode, url: session.url, }) : await recoverCurrentMeetTab({ config: this.params.config, + mode: session.mode, url: session.url, }); if (result.found && result.browser && session.chrome) { diff --git a/extensions/google-meet/src/transports/chrome.ts b/extensions/google-meet/src/transports/chrome.ts index f5c6f13f9ec..710bc264fb0 100644 --- a/extensions/google-meet/src/transports/chrome.ts +++ b/extensions/google-meet/src/transports/chrome.ts @@ -43,6 +43,7 @@ export const __testing = { setDepsForTest(deps: { callGatewayFromCli?: typeof callGatewayFromCli } | null) { chromeTransportDeps.callGatewayFromCli = deps?.callGatewayFromCli ?? 
callGatewayFromCli; }, + meetStatusScriptForTest: meetStatusScript, }; export function outputMentionsBlackHole2ch(output: string): boolean { @@ -209,6 +210,15 @@ function parseMeetBrowserStatus(result: unknown): GoogleMeetChromeHealth | undef const parsed = JSON.parse(raw) as { inCall?: boolean; micMuted?: boolean; + lobbyWaiting?: boolean; + leaveReason?: string; + captioning?: boolean; + captionsEnabledAttempted?: boolean; + transcriptLines?: number; + lastCaptionAt?: string; + lastCaptionSpeaker?: string; + lastCaptionText?: string; + recentTranscript?: GoogleMeetChromeHealth["recentTranscript"]; manualActionRequired?: boolean; manualActionReason?: GoogleMeetChromeHealth["manualActionReason"]; manualActionMessage?: string; @@ -219,6 +229,15 @@ function parseMeetBrowserStatus(result: unknown): GoogleMeetChromeHealth | undef return { inCall: parsed.inCall, micMuted: parsed.micMuted, + lobbyWaiting: parsed.lobbyWaiting, + leaveReason: parsed.leaveReason, + captioning: parsed.captioning, + captionsEnabledAttempted: parsed.captionsEnabledAttempted, + transcriptLines: parsed.transcriptLines, + lastCaptionAt: parsed.lastCaptionAt, + lastCaptionSpeaker: parsed.lastCaptionSpeaker, + lastCaptionText: parsed.lastCaptionText, + recentTranscript: parsed.recentTranscript, manualActionRequired: parsed.manualActionRequired, manualActionReason: parsed.manualActionReason, manualActionMessage: parsed.manualActionMessage, @@ -306,11 +325,13 @@ async function grantMeetMediaPermissions(params: { function meetStatusScript(params: { allowMicrophone: boolean; autoJoin: boolean; + captureCaptions: boolean; guestName: string; }) { return `() => { const text = (node) => (node?.innerText || node?.textContent || "").trim(); const allowMicrophone = ${JSON.stringify(params.allowMicrophone)}; + const captureCaptions = ${JSON.stringify(params.captureCaptions)}; const buttons = [...document.querySelectorAll('button')]; const notes = []; const findButton = (pattern) => @@ -356,6 +377,95 @@ 
function meetStatusScript(params: { notes.push("Skipped Meet microphone prompt for observe-only mode."); } const inCall = buttons.some((button) => /leave call/i.test(button.getAttribute('aria-label') || text(button))); + let captioning = false; + let captionsEnabledAttempted = false; + let transcriptLines = 0; + let lastCaptionAt; + let lastCaptionSpeaker; + let lastCaptionText; + let recentTranscript = []; + const captionSelector = '[role="region"][aria-label*="aption" i], [aria-live="polite"][role="region"], div[aria-live="polite"]'; + const captionState = (() => { + if (!captureCaptions) return undefined; + const w = window; + if (!inCall && !w.__openclawMeetCaptions) return undefined; + if (!w.__openclawMeetCaptions) { + w.__openclawMeetCaptions = { + enabledAttempted: false, + observerInstalled: false, + lines: [], + seen: {} + }; + } + return w.__openclawMeetCaptions; + })(); + const recordCaption = (speaker, captionText) => { + if (!captionState) return; + const clean = String(captionText || "").replace(/\\s+/g, " ").trim(); + const cleanSpeaker = String(speaker || "").replace(/\\s+/g, " ").trim(); + if (!clean || clean.length < 2) return; + if (/^(turn on captions|turn off captions|captions)$/i.test(clean)) return; + const key = (cleanSpeaker + "\\n" + clean).toLowerCase(); + if (captionState.seen[key]) return; + captionState.seen[key] = true; + const entry = { at: new Date().toISOString(), speaker: cleanSpeaker || undefined, text: clean }; + captionState.lines.push(entry); + if (captionState.lines.length > 50) captionState.lines.splice(0, captionState.lines.length - 50); + }; + const scrapeCaptions = () => { + if (!captionState) return; + const regions = [...document.querySelectorAll(captionSelector)]; + for (const region of regions) { + const raw = text(region); + if (!raw) continue; + const pieces = raw.split(/\\n+/).map((part) => part.trim()).filter(Boolean); + if (pieces.length >= 2) { + recordCaption(pieces[0], pieces.slice(1).join(" ")); + } else { + 
recordCaption("", pieces[0] || raw); + } + } + }; + if (captionState) { + if (inCall && !captionState.enabledAttempted) { + const captionButton = findButton(/turn on captions|show captions|captions/i); + const captionLabel = captionButton ? (captionButton.getAttribute("aria-label") || captionButton.getAttribute("data-tooltip") || text(captionButton)) : ""; + if (captionButton) { + captionState.enabledAttempted = true; + captionsEnabledAttempted = true; + if (!/turn off captions|hide captions/i.test(captionLabel)) { + captionButton.click(); + notes.push("Attempted to enable Meet captions for observe-only transcript health."); + } + } + } else if (captionState.enabledAttempted) { + captionsEnabledAttempted = true; + } + if (inCall && !captionState.observerInstalled) { + captionState.observerInstalled = true; + new MutationObserver(scrapeCaptions).observe(document.body, { + childList: true, + subtree: true, + characterData: true + }); + notes.push("Installed Meet caption observer for observe-only transcript health."); + } + if (inCall) { + scrapeCaptions(); + } + const lines = Array.isArray(captionState.lines) ? captionState.lines : []; + const last = lines[lines.length - 1]; + captioning = document.querySelector(captionSelector) !== null || lines.length > 0; + transcriptLines = lines.length; + lastCaptionAt = last?.at; + lastCaptionSpeaker = last?.speaker; + lastCaptionText = last?.text; + recentTranscript = lines.slice(-5); + } + const lobbyWaiting = !inCall && /asking to be let in|you.?ll join when someone lets you in|waiting to be let in|ask to join/i.test(pageText); + const leaveReason = /you left the meeting|you.?ve left the meeting|removed from the meeting|you were removed|call ended|meeting ended/i.test(pageText) + ? 
pageText.match(/you left the meeting|you.?ve left the meeting|removed from the meeting|you were removed|call ended|meeting ended/i)?.[0] + : undefined; let manualActionReason; let manualActionMessage; if (!inCall && (host === "accounts.google.com" || /use your google account|to continue to google meet|choose an account|sign in to (join|continue)/i.test(pageText))) { @@ -380,6 +490,15 @@ function meetStatusScript(params: { clickedMicrophoneChoice: Boolean(allowMicrophone && microphoneChoice), inCall, micMuted: mic ? /turn on microphone/i.test(mic.getAttribute('aria-label') || text(mic)) : undefined, + lobbyWaiting, + leaveReason, + captioning, + captionsEnabledAttempted, + transcriptLines, + lastCaptionAt, + lastCaptionSpeaker, + lastCaptionText, + recentTranscript, manualActionRequired: Boolean(manualActionReason), manualActionReason, manualActionMessage, @@ -490,6 +609,7 @@ async function openMeetWithBrowserRequest(params: { targetId, fn: meetStatusScript({ allowMicrophone: params.mode === "realtime", + captureCaptions: params.mode === "transcribe", guestName: params.config.chrome.guestName, autoJoin: params.config.chrome.autoJoin, }), @@ -544,10 +664,12 @@ function isRecoverableMeetTab(tab: BrowserTab, url?: string): boolean { async function inspectRecoverableMeetTab(params: { callBrowser: BrowserRequestCaller; config: GoogleMeetConfig; + mode?: "realtime" | "transcribe"; timeoutMs: number; tab: BrowserTab; targetId: string; }) { + const allowMicrophone = params.mode !== "transcribe"; await params.callBrowser({ method: "POST", path: "/tabs/focus", @@ -555,7 +677,7 @@ async function inspectRecoverableMeetTab(params: { timeoutMs: Math.min(params.timeoutMs, 5_000), }); const permissionNotes = await grantMeetMediaPermissions({ - allowMicrophone: true, + allowMicrophone, callBrowser: params.callBrowser, timeoutMs: params.timeoutMs, }); @@ -566,7 +688,8 @@ async function inspectRecoverableMeetTab(params: { kind: "evaluate", targetId: params.targetId, fn: 
meetStatusScript({ - allowMicrophone: true, + allowMicrophone, + captureCaptions: params.mode === "transcribe", guestName: params.config.chrome.guestName, autoJoin: false, }), @@ -596,6 +719,7 @@ async function inspectRecoverableMeetTab(params: { export async function recoverCurrentMeetTab(params: { config: GoogleMeetConfig; + mode?: "realtime" | "transcribe"; url?: string; }): Promise<{ transport: "chrome"; @@ -631,6 +755,7 @@ export async function recoverCurrentMeetTab(params: { ...(await inspectRecoverableMeetTab({ callBrowser: callLocalBrowserRequest, config: params.config, + mode: params.mode, timeoutMs, tab, targetId, @@ -641,6 +766,7 @@ export async function recoverCurrentMeetTab(params: { export async function recoverCurrentMeetTabOnNode(params: { runtime: PluginRuntime; config: GoogleMeetConfig; + mode?: "realtime" | "transcribe"; url?: string; }): Promise<{ transport: "chrome-node"; @@ -692,6 +818,7 @@ export async function recoverCurrentMeetTabOnNode(params: { timeoutMs: request.timeoutMs, }), config: params.config, + mode: params.mode, timeoutMs, tab, targetId, diff --git a/extensions/google-meet/src/transports/types.ts b/extensions/google-meet/src/transports/types.ts index df2ea75bd5a..575dc94633c 100644 --- a/extensions/google-meet/src/transports/types.ts +++ b/extensions/google-meet/src/transports/types.ts @@ -28,6 +28,19 @@ export type GoogleMeetSpeechBlockedReason = export type GoogleMeetChromeHealth = { inCall?: boolean; micMuted?: boolean; + lobbyWaiting?: boolean; + leaveReason?: string; + captioning?: boolean; + captionsEnabledAttempted?: boolean; + transcriptLines?: number; + lastCaptionAt?: string; + lastCaptionSpeaker?: string; + lastCaptionText?: string; + recentTranscript?: Array<{ + at?: string; + speaker?: string; + text: string; + }>; manualActionRequired?: boolean; manualActionReason?: GoogleMeetManualActionReason; manualActionMessage?: string;