diff --git a/CHANGELOG.md b/CHANGELOG.md index ca25ed400ae..bffbde9785d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,6 +55,7 @@ Docs: https://docs.openclaw.ai - Slack/Matrix: avoid creating blank progress-draft messages when `streaming.progress.label=false` and progress tool lines are disabled. Thanks @vincentkoc. - QA/Matrix: keep the mock OpenAI tool-progress provider aligned with exact-marker Matrix prompts so the hardened live preview scenario still forces a deterministic read before final delivery. Thanks @vincentkoc. - Google Meet: make realtime talk-back agent-driven by default with `realtime.strategy: "agent"`, keep the previous direct bidirectional model behavior available as `realtime.strategy: "bidi"`, route the Meet tab speaker output to `BlackHole 2ch` automatically for local Chrome realtime joins, coalesce nearby speech transcript fragments before consulting the agent, and avoid cutting off agent speech from server VAD or stale playback pipe errors. +- Google Meet: suppress realtime input while queued assistant audio is still playing and ignore assistant-like transcript echoes before the agent consult, so the agent does not hear its own speech as a new user turn and loop or cut itself off. - OpenAI/Google Meet: wait for realtime voice `session.updated` before treating the bridge as connected, so Meet joins do not return with audio queued behind an unconfigured realtime session. Thanks @vincentkoc. - Plugins/catalog: merge official external catalog descriptors into partial package channel config metadata, so lagging WeCom/Yuanbao manifests keep their own schema while still exposing host-supplied labels and setup text. Thanks @vincentkoc. - Plugins/catalog: supplement lagging official external WeCom and Yuanbao npm manifests with channel config descriptors and declared tool contracts from the OpenClaw catalog, so trusted package sweeps no longer fail because external package metadata trails the host contract. Thanks @vincentkoc. 
diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index 4433f1f0ef5..d90ed14ce44 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -1182,6 +1182,9 @@ configured OpenClaw agent before speaking. Set `realtime.strategy: "bidi"` when you want the realtime model to answer directly. Nearby final transcript fragments are coalesced before the consult so one spoken turn does not produce several stale partial answers. +Realtime input is also suppressed while queued assistant audio is still playing, +and recent assistant-like transcript echoes are ignored before the agent consult +so BlackHole loopback does not make the agent answer its own speech. | Strategy | Who decides the answer | Context behavior | Use when | | -------- | ----------------------------- | ------------------------------------------------------------------------------------ | ----------------------------------------------------- | diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index 593217baa61..7ddead7e4b5 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -24,7 +24,11 @@ import { } from "./src/meet.js"; import { handleGoogleMeetNodeHostCommand } from "./src/node-host.js"; import { startNodeRealtimeAudioBridge } from "./src/realtime-node.js"; -import { startCommandRealtimeAudioBridge } from "./src/realtime.js"; +import { + extendGoogleMeetOutputEchoSuppression, + isGoogleMeetLikelyAssistantEchoTranscript, + startCommandRealtimeAudioBridge, +} from "./src/realtime.js"; import { GoogleMeetRuntime, normalizeMeetUrl } from "./src/runtime.js"; import { invokeGoogleMeetGatewayMethodForTest, @@ -3766,6 +3770,60 @@ describe("google-meet plugin", () => { await handle.stop(); }); + it("tracks queued playback time when suppressing realtime input echo", () => { + const first = extendGoogleMeetOutputEchoSuppression({ + audio: Buffer.alloc(48_000), + audioFormat: "pcm16-24khz", + nowMs: 
1_000, + lastOutputPlayableUntilMs: 0, + suppressInputUntilMs: 0, + }); + const second = extendGoogleMeetOutputEchoSuppression({ + audio: Buffer.alloc(48_000), + audioFormat: "pcm16-24khz", + nowMs: 1_100, + lastOutputPlayableUntilMs: first.lastOutputPlayableUntilMs, + suppressInputUntilMs: first.suppressInputUntilMs, + }); + + expect(first).toMatchObject({ + durationMs: 1_000, + lastOutputPlayableUntilMs: 2_000, + suppressInputUntilMs: 5_000, + }); + expect(second).toMatchObject({ + durationMs: 1_000, + lastOutputPlayableUntilMs: 3_000, + suppressInputUntilMs: 6_000, + }); + }); + + it("detects assistant transcript echoes before agent consult", () => { + const nowMs = Date.parse("2026-05-04T01:00:00.000Z"); + const transcript = [ + { + at: new Date(nowMs - 1_000).toISOString(), + role: "assistant" as const, + text: "Hi Molty, glad to have you here. Let me know if there's anything specific you'd like to cover or if you need any support during the meeting.", + }, + ]; + + expect( + isGoogleMeetLikelyAssistantEchoTranscript({ + transcript, + text: "Let me know if there's anything specific you'd like to cover or if you need any support during the", + nowMs, + }), + ).toBe(true); + expect( + isGoogleMeetLikelyAssistantEchoTranscript({ + transcript, + text: "Tell me a story.", + nowMs, + }), + ).toBe(false); + }); + it("uses a local barge-in input command to clear active Chrome playback", async () => { let callbacks: | { diff --git a/extensions/google-meet/src/realtime-node.ts b/extensions/google-meet/src/realtime-node.ts index 4fc1dfd37ec..f44d02c6d77 100644 --- a/extensions/google-meet/src/realtime-node.ts +++ b/extensions/google-meet/src/realtime-node.ts @@ -17,11 +17,13 @@ import { getGoogleMeetRealtimeTranscriptHealth, buildGoogleMeetSpeakExactUserMessage, GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS, + extendGoogleMeetOutputEchoSuppression, getGoogleMeetRealtimeEventHealth, recordGoogleMeetRealtimeTranscript, recordGoogleMeetRealtimeEvent, 
resolveGoogleMeetRealtimeAudioFormat, resolveGoogleMeetRealtimeProvider, + isGoogleMeetLikelyAssistantEchoTranscript, type GoogleMeetRealtimeEventEntry, type GoogleMeetRealtimeTranscriptEntry, } from "./realtime.js"; @@ -65,6 +67,10 @@ export async function startNodeRealtimeAudioBridge(params: { let lastClearAt: string | undefined; let lastInputBytes = 0; let lastOutputBytes = 0; + let suppressedInputBytes = 0; + let lastSuppressedInputAt: string | undefined; + let suppressInputUntil = 0; + let lastOutputPlayableUntilMs = 0; let consecutiveInputErrors = 0; let lastInputError: string | undefined; let clearCount = 0; @@ -199,6 +205,15 @@ export async function startNodeRealtimeAudioBridge(params: { audioSink: { isOpen: () => !stopped, sendAudio: (audio) => { + const suppression = extendGoogleMeetOutputEchoSuppression({ + audio, + audioFormat: params.config.chrome.audioFormat, + nowMs: Date.now(), + lastOutputPlayableUntilMs, + suppressInputUntilMs: suppressInputUntil, + }); + suppressInputUntil = suppression.suppressInputUntilMs; + lastOutputPlayableUntilMs = suppression.lastOutputPlayableUntilMs; lastOutputAt = new Date().toISOString(); lastOutputBytes += audio.byteLength; void params.runtime.nodes @@ -222,6 +237,8 @@ export async function startNodeRealtimeAudioBridge(params: { clearAudio: () => { lastClearAt = new Date().toISOString(); clearCount += 1; + suppressInputUntil = 0; + lastOutputPlayableUntilMs = 0; void params.runtime.nodes .invoke({ nodeId: params.nodeId, @@ -245,6 +262,12 @@ export async function startNodeRealtimeAudioBridge(params: { recordGoogleMeetRealtimeTranscript(transcript, role, text); params.logger.info(`[google-meet] node realtime ${role}: ${text}`); if (role === "user" && strategy === "agent") { + if (isGoogleMeetLikelyAssistantEchoTranscript({ transcript, text })) { + params.logger.info( + `[google-meet] node realtime ignored assistant echo transcript: ${text}`, + ); + return; + } enqueueAgentConsultForUserTranscript(text); } } @@ -332,6 
+355,11 @@ export async function startNodeRealtimeAudioBridge(params: { const base64 = readString(result.base64); if (base64) { const audio = Buffer.from(base64, "base64"); + if (Date.now() < suppressInputUntil) { + lastSuppressedInputAt = new Date().toISOString(); + suppressedInputBytes += audio.byteLength; + continue; + } lastInputAt = new Date().toISOString(); lastInputBytes += audio.byteLength; bridge?.sendAudio(audio); @@ -372,9 +400,11 @@ export async function startNodeRealtimeAudioBridge(params: { audioOutputActive: lastOutputBytes > 0, lastInputAt, lastOutputAt, + lastSuppressedInputAt, lastClearAt, lastInputBytes, lastOutputBytes, + suppressedInputBytes, ...getGoogleMeetRealtimeTranscriptHealth(transcript), ...getGoogleMeetRealtimeEventHealth(realtimeEvents), consecutiveInputErrors, diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index 2a54ae74bc7..33deff1a892 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -100,6 +100,8 @@ export type GoogleMeetRealtimeEventEntry = RealtimeVoiceBridgeEvent & { }; export const GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS = 900; +export const GOOGLE_MEET_OUTPUT_ECHO_SUPPRESSION_TAIL_MS = 3_000; +export const GOOGLE_MEET_TRANSCRIPT_ECHO_LOOKBACK_MS = 45_000; export function recordGoogleMeetRealtimeEvent( events: GoogleMeetRealtimeEventEntry[], @@ -157,6 +159,80 @@ function readPcm16Stats(audio: Buffer): { rms: number; peak: number } { }; } +function normalizeTranscriptForEchoMatch(text: string): string[] { + return text + .toLowerCase() + .replace(/['’]/g, "") + .replace(/[^a-z0-9]+/g, " ") + .trim() + .split(/\s+/) + .filter((token) => token.length > 1); +} + +function hasMeaningfulEchoOverlap(userTokens: string[], assistantTokens: string[]): boolean { + if (userTokens.length < 4 || assistantTokens.length < 4) { + return false; + } + const assistantTokenSet = new Set(assistantTokens); + const overlap = userTokens.filter((token) => 
assistantTokenSet.has(token)).length; + return overlap / userTokens.length >= 0.58; +} + +export function isGoogleMeetLikelyAssistantEchoTranscript(params: { + transcript: GoogleMeetRealtimeTranscriptEntry[]; + text: string; + nowMs?: number; +}): boolean { + const userTokens = normalizeTranscriptForEchoMatch(params.text); + if (userTokens.length < 4) { + return false; + } + const nowMs = params.nowMs ?? Date.now(); + const recentAssistantText = params.transcript + .filter((entry) => { + if (entry.role !== "assistant") { + return false; + } + const at = Date.parse(entry.at); + return Number.isFinite(at) && nowMs - at <= GOOGLE_MEET_TRANSCRIPT_ECHO_LOOKBACK_MS; + }) + .slice(-6) + .map((entry) => entry.text) + .join(" "); + if (!recentAssistantText.trim()) { + return false; + } + const userNormalized = userTokens.join(" "); + const assistantTokens = normalizeTranscriptForEchoMatch(recentAssistantText); + const assistantNormalized = assistantTokens.join(" "); + return ( + (userNormalized.length >= 18 && assistantNormalized.includes(userNormalized)) || + (assistantNormalized.length >= 18 && userNormalized.includes(assistantNormalized)) || + hasMeaningfulEchoOverlap(userTokens, assistantTokens) + ); +} + +export function extendGoogleMeetOutputEchoSuppression(params: { + audio: Buffer; + audioFormat: GoogleMeetConfig["chrome"]["audioFormat"]; + nowMs: number; + lastOutputPlayableUntilMs: number; + suppressInputUntilMs: number; +}): { lastOutputPlayableUntilMs: number; suppressInputUntilMs: number; durationMs: number } { + const bytesPerMs = params.audioFormat === "g711-ulaw-8khz" ? 
8 : 48; + const durationMs = Math.ceil(params.audio.byteLength / bytesPerMs); + const playbackStartMs = Math.max(params.nowMs, params.lastOutputPlayableUntilMs); + const playbackEndMs = playbackStartMs + durationMs; + return { + durationMs, + lastOutputPlayableUntilMs: playbackEndMs, + suppressInputUntilMs: Math.max( + params.suppressInputUntilMs, + playbackEndMs + GOOGLE_MEET_OUTPUT_ECHO_SUPPRESSION_TAIL_MS, + ), + }; +} + export function resolveGoogleMeetRealtimeAudioFormat(config: GoogleMeetConfig) { return config.chrome.audioFormat === "g711-ulaw-8khz" ? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ @@ -227,11 +303,15 @@ export async function startCommandRealtimeAudioBridge(params: { let agentConsultDebounceTimer: ReturnType | undefined; const suppressInputForOutput = (audio: Buffer) => { - const bytesPerMs = params.config.chrome.audioFormat === "g711-ulaw-8khz" ? 8 : 48; - const durationMs = Math.ceil(audio.byteLength / bytesPerMs); - const until = Date.now() + durationMs + 900; - suppressInputUntil = Math.max(suppressInputUntil, until); - lastOutputPlayableUntilMs = Math.max(lastOutputPlayableUntilMs, until); + const suppression = extendGoogleMeetOutputEchoSuppression({ + audio, + audioFormat: params.config.chrome.audioFormat, + nowMs: Date.now(), + lastOutputPlayableUntilMs, + suppressInputUntilMs: suppressInputUntil, + }); + suppressInputUntil = suppression.suppressInputUntilMs; + lastOutputPlayableUntilMs = suppression.lastOutputPlayableUntilMs; }; const terminateProcess = (proc: BridgeProcess, signal: NodeJS.Signals = "SIGTERM") => { @@ -521,6 +601,10 @@ export async function startCommandRealtimeAudioBridge(params: { recordGoogleMeetRealtimeTranscript(transcript, role, text); params.logger.info(`[google-meet] realtime ${role}: ${text}`); if (role === "user" && strategy === "agent") { + if (isGoogleMeetLikelyAssistantEchoTranscript({ transcript, text })) { + params.logger.info(`[google-meet] realtime ignored assistant echo transcript: ${text}`); + return; + } 
enqueueAgentConsultForUserTranscript(text); } }