diff --git a/CHANGELOG.md b/CHANGELOG.md index 91ce5d227c1..37d6480aef2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Google Meet: refresh realtime browser state during status and retry delayed speech after Meet finishes joining, so a just-opened in-call tab no longer leaves speech stuck behind stale `not-in-call` health. - Google Meet: grant Meet media permissions through the Playwright browser context when CDP grants do not affect the attached Chrome page, and report in-call microphone/speaker permission problems instead of marking realtime speech ready. - Google Chat: update the setup example to use the accepted `groups..enabled` key instead of the legacy `allow` alias, with a schema regression for the documented group shape. Thanks @vincentkoc. - Control UI/WebChat: collapse duplicate in-flight internal text sends onto the active Gateway run so rapid repeat submits do not start fresh `agent:main:main` dispatches. Fixes #75737. Thanks @dsdsddd1 and @BunsDev. diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index 9a41bbc652a..4d1a80470c6 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -2599,6 +2599,112 @@ describe("google-meet plugin", () => { expect(result.details).toMatchObject({ createdSession: true }); }); + it("refreshes realtime browser state in status after a delayed Meet join", async () => { + const originalPlatform = process.platform; + Object.defineProperty(process, "platform", { value: "darwin" }); + try { + let browserState: Record = { + inCall: false, + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }; + let opened = false; + const callGatewayFromCli = vi.fn( + async ( + _method: string, + _opts: unknown, + params?: unknown, + _extra?: unknown, + ): Promise> => { + const request = params as { + path?: string; + body?: { targetId?: string; url?: string }; + }; + if (request.path === "/tabs") { + return { + tabs: opened + ? [ + { + targetId: "local-meet-tab", + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }, + ] + : [], + }; + } + if (request.path === "/tabs/open") { + opened = true; + return { + targetId: "local-meet-tab", + title: "Meet", + url: request.body?.url ?? "https://meet.google.com/abc-defg-hij", + }; + } + if (request.path === "/tabs/focus" || request.path === "/permissions/grant") { + return { ok: true }; + } + if (request.path === "/act") { + return { result: JSON.stringify(browserState) }; + } + throw new Error(`unexpected browser request path ${request.path}`); + }, + ); + chromeTransportTesting.setDepsForTest({ callGatewayFromCli }); + const { methods } = setup({ + chrome: { + audioBridgeCommand: ["bridge", "start"], + waitForInCallMs: 1, + }, + realtime: { introMessage: "" }, + }); + const join = methods.get("googlemeet.join") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const status = methods.get("googlemeet.status") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const joinRespond = vi.fn(); + const statusRespond = vi.fn(); + + await join?.({ + params: { url: "https://meet.google.com/abc-defg-hij" }, + respond: joinRespond, + }); + expect(joinRespond.mock.calls[0]?.[1]).toMatchObject({ + session: { chrome: { health: { inCall: false } } }, + }); + browserState = { + inCall: true, + micMuted: false, + title: "Meet", + url: "https://meet.google.com/abc-defg-hij", + }; + await status?.({ params: {}, respond: statusRespond }); + + expect(statusRespond.mock.calls[0]?.[1]).toMatchObject({ + sessions: [ + { + chrome: { + health: { + inCall: true, + speechReady: true, + }, + }, + }, + ], + }); + } finally { + Object.defineProperty(process, "platform", { value: originalPlatform }); + } + }); + it("exposes a test-listen action that proves transcript movement", async () => { const { tools, nodesInvoke } = setup( { diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index 695c6598287..429399f31f7 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -216,12 +216,12 @@ export class GoogleMeetRuntime { const sessions = [...this.#sessions.values()].toSorted((a, b) => a.createdAt.localeCompare(b.createdAt), ); - await Promise.all(sessions.map((session) => this.#refreshCaptionHealthForSession(session))); + await Promise.all(sessions.map((session) => this.#refreshStatusHealthForSession(session))); return { found: true, sessions }; } const session = this.#sessions.get(sessionId); if (session) { - await this.#refreshCaptionHealthForSession(session); + await this.#refreshStatusHealthForSession(session); } return session ? { found: true, session } : { found: false }; } @@ -357,7 +357,7 @@ export class GoogleMeetRuntime { reusable.updatedAt = nowIso(); const spoken = mode === "realtime" && speechInstructions - ? (await this.speak(reusable.id, speechInstructions)).spoken + ? await this.#speakWhenReady(reusable, speechInstructions) : false; return { session: reusable, spoken }; } @@ -506,7 +506,7 @@ export class GoogleMeetRuntime { transport === "twilio" ? delegatedTwilioSpoken : mode === "realtime" && speechInstructions - ? (await this.speak(session.id, speechInstructions)).spoken + ? await this.#speakWhenReady(session, speechInstructions) : false; return { session, spoken }; } @@ -570,6 +570,34 @@ export class GoogleMeetRuntime { return { found: true, spoken: true, session }; } + async #speakWhenReady(session: GoogleMeetSession, instructions: string): Promise { + let result = await this.speak(session.id, instructions); + if (result.spoken || !session.chrome?.audioBridge || session.transport === "twilio") { + return result.spoken; + } + const waitMs = Math.min( + Math.max(0, this.params.config.chrome.waitForInCallMs), + Math.max(0, this.params.config.chrome.joinTimeoutMs), + ); + const deadline = Date.now() + waitMs; + while (Date.now() < deadline) { + await sleep(250); + result = await this.speak(session.id, instructions); + if (result.spoken) { + return true; + } + const health = result.session?.chrome?.health; + if (health?.manualActionRequired || result.session?.state !== "active") { + return false; + } + const blocked = health?.speechBlockedReason; + if (blocked && blocked !== "not-in-call" && blocked !== "browser-unverified") { + return false; + } + } + return false; + } + async testSpeech(request: GoogleMeetJoinRequest): Promise<{ createdSession: boolean; inCall?: boolean; @@ -735,12 +763,27 @@ export class GoogleMeetRuntime { await this.#refreshBrowserHealthForChromeSession(session); } - async #refreshBrowserHealthForChromeSession(session: GoogleMeetSession) { + async #refreshStatusHealthForSession(session: GoogleMeetSession) { + if (session.transport === "chrome" || session.transport === "chrome-node") { + if (session.chrome?.health?.manualActionRequired) { + this.#refreshSpeechReadiness(session); + return; + } + await this.#refreshBrowserHealthForChromeSession(session, { force: true, readOnly: true }); + return; + } + this.#refreshSpeechReadiness(session); + } + + async #refreshBrowserHealthForChromeSession( + session: GoogleMeetSession, + options: { force?: boolean; readOnly?: boolean } = {}, + ) { if (!isManagedChromeBrowserSession(session)) { this.#refreshSpeechReadiness(session); return; } - if (session.mode === "realtime" && evaluateSpeechReadiness(session).ready) { + if (!options.force && session.mode === "realtime" && evaluateSpeechReadiness(session).ready) { this.#refreshSpeechReadiness(session); return; } @@ -751,11 +794,13 @@ export class GoogleMeetRuntime { runtime: this.params.runtime, config: this.params.config, mode: session.mode, + readOnly: options.readOnly, url: session.url, }) : await recoverCurrentMeetTab({ config: this.params.config, mode: session.mode, + readOnly: options.readOnly, url: session.url, }); if (result.found && result.browser && session.chrome) { @@ -775,6 +820,9 @@ export class GoogleMeetRuntime { #refreshSpeechReadiness(session: GoogleMeetSession) { const readiness = evaluateSpeechReadiness(session); + if (readiness.ready) { + session.notes = session.notes.filter((note) => !note.startsWith("Realtime speech blocked:")); + } if (session.chrome) { session.chrome.health = { ...session.chrome.health, diff --git a/extensions/google-meet/src/transports/chrome.ts b/extensions/google-meet/src/transports/chrome.ts index 870999a2bb2..90c71553e44 100644 --- a/extensions/google-meet/src/transports/chrome.ts +++ b/extensions/google-meet/src/transports/chrome.ts @@ -327,11 +327,13 @@ function meetStatusScript(params: { autoJoin: boolean; captureCaptions: boolean; guestName: string; + readOnly?: boolean; }) { return `() => { const text = (node) => (node?.innerText || node?.textContent || "").trim(); const allowMicrophone = ${JSON.stringify(params.allowMicrophone)}; const captureCaptions = ${JSON.stringify(params.captureCaptions)}; + const readOnly = ${JSON.stringify(Boolean(params.readOnly))}; const buttons = [...document.querySelectorAll('button')]; const buttonLabel = (button) => [ @@ -351,7 +353,7 @@ function meetStatusScript(params: { const input = [...document.querySelectorAll('input')].find((el) => /your name/i.test(el.getAttribute('aria-label') || el.placeholder || '') ); - if (${JSON.stringify(params.autoJoin)} && input && !input.value) { + if (!readOnly && ${JSON.stringify(params.autoJoin)} && input && !input.value) { input.focus(); input.value = ${JSON.stringify(params.guestName)}; input.dispatchEvent(new Event('input', { bubbles: true })); @@ -363,20 +365,20 @@ function meetStatusScript(params: { const pageUrl = location.href; const permissionNeeded = /permission needed|microphone problem|speaker problem|allow.*(microphone|camera)|blocked.*(microphone|camera)|permission.*(microphone|camera|speaker)/i.test(permissionText); const mic = buttons.find((button) => /turn off microphone|turn on microphone|microphone/i.test(button.getAttribute('aria-label') || text(button))); - if (!allowMicrophone && mic && /turn off microphone/i.test(mic.getAttribute('aria-label') || text(mic))) { + if (!readOnly && !allowMicrophone && mic && /turn off microphone/i.test(mic.getAttribute('aria-label') || text(mic))) { mic.click(); notes.push("Muted Meet microphone for observe-only mode."); } - const join = ${JSON.stringify(params.autoJoin)} + const join = !readOnly && ${JSON.stringify(params.autoJoin)} ? findButton(/join now|ask to join/i) : null; if (join) join.click(); const microphoneChoice = findButton(/\\buse microphone\\b/i); const noMicrophoneChoice = findButton(/\\b(continue|join|use) without (microphone|mic)\\b|\\bnot now\\b/i); - if (allowMicrophone && microphoneChoice) { + if (!readOnly && allowMicrophone && microphoneChoice) { microphoneChoice.click(); notes.push("Accepted Meet microphone prompt with browser automation."); - } else if (!allowMicrophone && noMicrophoneChoice) { + } else if (!readOnly && !allowMicrophone && noMicrophoneChoice) { noMicrophoneChoice.click(); notes.push("Skipped Meet microphone prompt for observe-only mode."); } @@ -431,7 +433,7 @@ function meetStatusScript(params: { } }; if (captionState) { - if (inCall && !captionState.enabledAttempted) { + if (!readOnly && inCall && !captionState.enabledAttempted) { const captionButton = findButton(/turn on captions|show captions|captions/i); const captionLabel = captionButton ? (captionButton.getAttribute("aria-label") || captionButton.getAttribute("data-tooltip") || text(captionButton)) : ""; if (captionButton) { @@ -669,6 +671,7 @@ async function inspectRecoverableMeetTab(params: { callBrowser: BrowserRequestCaller; config: GoogleMeetConfig; mode?: "realtime" | "transcribe"; + readOnly?: boolean; timeoutMs: number; tab: BrowserTab; targetId: string; @@ -680,11 +683,13 @@ async function inspectRecoverableMeetTab(params: { body: { targetId: params.targetId }, timeoutMs: Math.min(params.timeoutMs, 5_000), }); - const permissionNotes = await grantMeetMediaPermissions({ - allowMicrophone, - callBrowser: params.callBrowser, - timeoutMs: params.timeoutMs, - }); + const permissionNotes = params.readOnly + ? [] + : await grantMeetMediaPermissions({ + allowMicrophone, + callBrowser: params.callBrowser, + timeoutMs: params.timeoutMs, + }); const evaluated = await params.callBrowser({ method: "POST", path: "/act", @@ -696,6 +701,7 @@ async function inspectRecoverableMeetTab(params: { captureCaptions: params.mode === "transcribe", guestName: params.config.chrome.guestName, autoJoin: false, + readOnly: params.readOnly, }), }, timeoutMs: Math.min(params.timeoutMs, 10_000), @@ -724,6 +730,7 @@ async function inspectRecoverableMeetTab(params: { export async function recoverCurrentMeetTab(params: { config: GoogleMeetConfig; mode?: "realtime" | "transcribe"; + readOnly?: boolean; url?: string; }): Promise<{ transport: "chrome"; @@ -760,6 +767,7 @@ export async function recoverCurrentMeetTab(params: { callBrowser: callLocalBrowserRequest, config: params.config, mode: params.mode, + readOnly: params.readOnly, timeoutMs, tab, targetId, @@ -771,6 +779,7 @@ export async function recoverCurrentMeetTabOnNode(params: { runtime: PluginRuntime; config: GoogleMeetConfig; mode?: "realtime" | "transcribe"; + readOnly?: boolean; url?: string; }): Promise<{ transport: "chrome-node"; @@ -823,6 +832,7 @@ export async function recoverCurrentMeetTabOnNode(params: { }), config: params.config, mode: params.mode, + readOnly: params.readOnly, timeoutMs, tab, targetId,