diff --git a/CHANGELOG.md b/CHANGELOG.md index 56b729e615c..2e9c057a29d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Google Meet: interrupt Realtime provider output when local barge-in clears playback, so command-pair audio stops model speech instead of only restarting Chrome playback. Fixes #73850. (#73834) Thanks @shhtheonlyperson. - Voice Call/Twilio: honor stored pre-connect TwiML before realtime webhook shortcuts and reject DTMF sequences outside conversation mode, so Meet PIN entry cannot be skipped or silently dropped. Thanks @donkeykong91 and @PfanP. - Google Meet/Voice Call: play Twilio Meet DTMF before opening the realtime media stream and carry the intro as the initial Voice Call message, so the greeting is generated after Meet admits the phone participant instead of racing a live-call TwiML update. Thanks @donkeykong91 and @PfanP. - Google Meet/Voice Call: make Twilio setup preflight honor explicit `--transport twilio` and fail local/private Voice Call webhook URLs, including IPv6 loopback and unique-local forms, before joins. Thanks @donkeykong91 and @PfanP. diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index 3f6a354abc6..8fb86a221a5 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -a69e6b650513c2a697ee51087928bf78f63ba998c7c60f8cca61dd65a0184fd0 config-baseline.json +13b715c3aac380161ec167bccfcfb902c3231a802a08ab7ca9ef760e0c11913a config-baseline.json 0a259216178a582c567d1fa48c5236bff4bbd27c3e6af838ffcd042459ffce3c config-baseline.core.json -92712871defa92eeda8161b516db85574681f2b70678b940508a808b987aeae2 config-baseline.channel.json -6005cf9f6e8c9f25ef97207b5eee29ae0e506cf910cdeca77fc9894ad1755b1f config-baseline.plugin.json +da8e055ebba0730498703d209f9e2cfaa1484a83f3240e611dcdd7280e22a525 config-baseline.channel.json +8d41287cd9cb696cf8a5e8810bd731b9eda4af9b0829c6dadae2da56e19dc644 config-baseline.plugin.json diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index ea124ee5ed8..35f5d267c44 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -924,6 +924,16 @@ Defaults: and writing audio in `chrome.audioFormat` - `chrome.audioOutputCommand`: SoX command reading audio in `chrome.audioFormat` and writing to CoreAudio `BlackHole 2ch` +- `chrome.bargeInInputCommand`: optional local microphone command that writes + signed 16-bit little-endian mono PCM for human barge-in detection while + assistant playback is active. This currently applies to the Gateway-hosted + `chrome` command-pair bridge. +- `chrome.bargeInRmsThreshold: 650`: RMS level that counts as a human + interruption on `chrome.bargeInInputCommand` +- `chrome.bargeInPeakThreshold: 2500`: peak level that counts as a human + interruption on `chrome.bargeInInputCommand` +- `chrome.bargeInCooldownMs: 900`: minimum delay between repeated human + interruption clears - `realtime.provider: "openai"` - `realtime.toolPolicy: "safe-read-only"` - `realtime.instructions`: brief spoken replies, with @@ -946,6 +956,24 @@ Optional overrides: chrome: { guestName: "OpenClaw Agent", waitForInCallMs: 30000, + bargeInInputCommand: [ + "sox", + "-q", + "-t", + "coreaudio", + "External Microphone", + "-r", + "24000", + "-c", + "1", + "-b", + "16", + "-e", + "signed-integer", + "-t", + "raw", + "-", + ], }, chromeNode: { node: "parallels-macos", @@ -1028,6 +1056,8 @@ a session ended. 
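For intuition about what `chrome.bargeInRmsThreshold` and `chrome.bargeInPeakThreshold` measure, here is a minimal sketch of the level math the detector applies to each chunk from `chrome.bargeInInputCommand`. The `pcm16Levels` name is illustrative; the logic mirrors the `readPcm16Stats` helper added in `extensions/google-meet/src/realtime.ts` later in this diff.

```typescript
// Illustrative sketch of the barge-in level math; pcm16Levels is a
// hypothetical name, mirroring the readPcm16Stats helper in this change.
function pcm16Levels(audio: Buffer): { rms: number; peak: number } {
  let sumSquares = 0;
  let peak = 0;
  let samples = 0;
  // Samples are signed 16-bit little-endian mono PCM; full scale is 32767.
  for (let offset = 0; offset + 1 < audio.byteLength; offset += 2) {
    const sample = audio.readInt16LE(offset);
    peak = Math.max(peak, Math.abs(sample));
    sumSquares += sample * sample;
    samples += 1;
  }
  return { rms: samples > 0 ? Math.sqrt(sumSquares / samples) : 0, peak };
}

// A chunk counts as a human interruption when either threshold is reached:
// with the defaults, roughly 2% of full scale sustained (rms >= 650) or an
// ~8%-of-full-scale transient (peak >= 2500).
```

Raising the thresholds makes the detector less sensitive (useful when assistant playback bleeds into the barge-in microphone); lowering them helps quiet speakers interrupt.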
not send the intro/test phrase into the audio bridge. - `providerConnected` / `realtimeReady`: realtime voice bridge state - `lastInputAt` / `lastOutputAt`: last audio seen from or sent to the bridge +- `lastSuppressedInputAt` / `suppressedInputBytes`: loopback input ignored while + assistant playback is active ```json { @@ -1448,6 +1478,14 @@ For clean duplex audio, route Meet output and Meet microphone through separate virtual devices or a Loopback-style virtual device graph. A single shared BlackHole device can echo other participants back into the call. +With the command-pair Chrome bridge, `chrome.bargeInInputCommand` can listen to a +separate local microphone and clear assistant playback when the human starts +talking. This keeps human speech ahead of assistant output even when the shared +BlackHole loopback input is temporarily suppressed during assistant playback. +Like `chrome.audioInputCommand` and `chrome.audioOutputCommand`, it is an +operator-configured local command. Use an explicit trusted command path or +argument list, and do not point it at scripts from untrusted locations. + `googlemeet speak` triggers the active realtime audio bridge for a Chrome session. `googlemeet leave` stops that bridge. For Twilio sessions delegated through the Voice Call plugin, `leave` also hangs up the underlying voice call. diff --git a/docs/plugins/sdk-provider-plugins.md b/docs/plugins/sdk-provider-plugins.md index 67b76162da2..b68efa2e9d2 100644 --- a/docs/plugins/sdk-provider-plugins.md +++ b/docs/plugins/sdk-provider-plugins.md @@ -593,6 +593,7 @@ API key auth, and dynamic model resolution. connect: async () => {}, sendAudio: () => {}, setMediaTimestamp: () => {}, + handleBargeIn: () => {}, submitToolResult: () => {}, acknowledgeMark: () => {}, close: () => {}, @@ -600,6 +601,10 @@ API key auth, and dynamic model resolution. }), }); ``` + + Implement `handleBargeIn` when a transport can detect that a human is + interrupting assistant playback and the provider supports truncating or + clearing the active audio response. 
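A sketch of what such an implementation can look like, modeled on the OpenAI provider changes later in this diff. `sendEvent`, `responseActive`, `lastAssistantItemId`, `responseStartTimestamp`, and `latestMediaTimestamp` stand in for the bridge's internal state; other providers substitute their own cancel/truncate events.

```typescript
// Provider-side handleBargeIn sketch (simplified from the OpenAI bridge in
// this change; the real implementation also consults its mark queue).
handleBargeIn(options?: RealtimeVoiceBargeInOptions): void {
  if (options?.audioPlaybackActive && this.responseActive) {
    // Stop the model from streaming more audio for the active response.
    this.sendEvent({ type: "response.cancel" });
  }
  if (this.lastAssistantItemId !== null && this.responseStartTimestamp !== null) {
    // Truncate the interrupted item at the playback position reported via
    // setMediaTimestamp, so history matches what the human actually heard.
    this.sendEvent({
      type: "conversation.item.truncate",
      item_id: this.lastAssistantItemId,
      content_index: 0,
      audio_end_ms: Math.max(0, this.latestMediaTimestamp - this.responseStartTimestamp),
    });
    this.lastAssistantItemId = null;
    this.responseStartTimestamp = null;
  }
  // Always ask the transport to drop any locally buffered assistant audio.
  this.config.onClearAudio();
}
```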
```typescript diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index f69c7b4df72..ce71e091be0 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -357,6 +357,9 @@ describe("google-meet plugin", () => { "coreaudio", "BlackHole 2ch", ], + bargeInRmsThreshold: 650, + bargeInPeakThreshold: 2500, + bargeInCooldownMs: 900, }, voiceCall: { enabled: true, @@ -375,6 +378,48 @@ describe("google-meet plugin", () => { expect(resolveGoogleMeetConfig({}).realtime.instructions).toContain("openclaw_agent_consult"); }); + it("declares barge-in config metadata in the plugin entry and manifest", () => { + const manifest = JSON.parse( + readFileSync(new URL("./openclaw.plugin.json", import.meta.url), "utf8"), + ) as { + uiHints?: Record<string, unknown>; + configSchema?: { + properties?: { + chrome?: { + properties?: Record<string, unknown>; + }; + }; + }; + }; + const entry = plugin as unknown as { + configSchema: { + uiHints?: Record<string, unknown>; + }; + }; + + expect(entry.configSchema.uiHints).toMatchObject({ + "chrome.bargeInInputCommand": expect.objectContaining({ advanced: true }), + "chrome.bargeInRmsThreshold": expect.objectContaining({ advanced: true }), + "chrome.bargeInPeakThreshold": expect.objectContaining({ advanced: true }), + "chrome.bargeInCooldownMs": expect.objectContaining({ advanced: true }), + }); + expect(manifest.uiHints).toMatchObject({ + "chrome.bargeInInputCommand": expect.objectContaining({ advanced: true }), + "chrome.bargeInRmsThreshold": expect.objectContaining({ advanced: true }), + "chrome.bargeInPeakThreshold": expect.objectContaining({ advanced: true }), + "chrome.bargeInCooldownMs": expect.objectContaining({ advanced: true }), + }); + expect(manifest.configSchema?.properties?.chrome?.properties).toMatchObject({ + bargeInInputCommand: expect.objectContaining({ + type: "array", + items: { type: "string" }, + }), + bargeInRmsThreshold: expect.objectContaining({ type: "number", default: 650 }), + bargeInPeakThreshold: expect.objectContaining({ type: "number", default: 2500 }), + bargeInCooldownMs: expect.objectContaining({ type: "number", default: 900 }), + }); + }); + it("resolves the realtime consult agent id", () => { expect( resolveGoogleMeetConfig({ @@ -1345,6 +1390,53 @@ describe("google-meet plugin", () => { } }); + it("checks a configured local barge-in command in setup status", async () => { + const originalPlatform = process.platform; + Object.defineProperty(process, "platform", { value: "darwin" }); + try { + const { tools } = setup( + { + defaultTransport: "chrome", + chrome: { + bargeInInputCommand: ["missing-barge-capture"], + }, + }, + { + runCommandWithTimeoutHandler: async (argv) => { + if (argv[0] === "/usr/sbin/system_profiler") { + return { code: 0, stdout: "BlackHole 2ch", stderr: "" }; + } + if (argv[0] === "/bin/sh" && argv.at(-1) === "missing-barge-capture") { + return { code: 1, stdout: "", stderr: "" }; + } + return { code: 0, stdout: "", stderr: "" }; + }, + }, + ); + const tool = tools[0] as { + execute: ( + id: string, + params: unknown, + ) => Promise<{ details: { ok?: boolean; checks?: unknown[] } }>; + }; + + const result = await tool.execute("id", { action: "setup_status", transport: "chrome" }); + + expect(result.details.ok).toBe(false); + expect(result.details.checks).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + id: "chrome-local-audio-commands", + ok: false, + message: "Chrome audio command missing: missing-barge-capture", + }), + ]), + ); + } finally {
Object.defineProperty(process, "platform", { value: originalPlatform }); + } + }); + it("skips local Chrome audio prerequisites for observe-only setup status", async () => { const originalPlatform = process.platform; Object.defineProperty(process, "platform", { value: "darwin" }); @@ -2398,6 +2490,7 @@ describe("google-meet plugin", () => { connect: vi.fn(async () => {}), sendAudio, setMediaTimestamp: vi.fn(), + handleBargeIn: vi.fn(), submitToolResult: vi.fn(), acknowledgeMark: vi.fn(), close: vi.fn(), @@ -2517,7 +2610,7 @@ describe("google-meet plugin", () => { }); expect(sendAudio).toHaveBeenCalledWith(Buffer.from([1, 2, 3])); expect(outputStdinWrites).toEqual([Buffer.from([4, 5])]); - expect(outputProcess.kill).toHaveBeenCalledWith("SIGTERM"); + expect(outputProcess.kill).toHaveBeenCalledWith("SIGKILL"); expect(replacementOutputStdinWrites).toEqual([Buffer.from([6, 7])]); outputProcess.emit("error", new Error("stale output process failed after clear")); expect(bridge.close).not.toHaveBeenCalled(); @@ -2570,7 +2663,114 @@ describe("google-meet plugin", () => { await handle.stop(); expect(bridge.close).toHaveBeenCalled(); expect(inputProcess.kill).toHaveBeenCalledWith("SIGTERM"); - expect(outputProcess.kill).toHaveBeenCalledWith("SIGTERM"); + expect(replacementOutputProcess.kill).toHaveBeenCalledWith("SIGTERM"); + }); + + it("uses a local barge-in input command to clear active Chrome playback", async () => { + let callbacks: + | { + onAudio: (audio: Buffer) => void; + } + | undefined; + const sendAudio = vi.fn(); + const bridge = { + connect: vi.fn(async () => {}), + sendAudio, + setMediaTimestamp: vi.fn(), + handleBargeIn: vi.fn(), + submitToolResult: vi.fn(), + acknowledgeMark: vi.fn(), + close: vi.fn(), + isConnected: vi.fn(() => true), + }; + const provider: RealtimeVoiceProviderPlugin = { + id: "openai", + label: "OpenAI", + autoSelectOrder: 1, + resolveConfig: ({ rawConfig }) => rawConfig, + isConfigured: () => true, + createBridge: (req) => { + callbacks = req; + return bridge; + }, + }; + const inputStdout = new PassThrough(); + const bargeInStdout = new PassThrough(); + const outputStdin = new Writable({ + write(_chunk, _encoding, done) { + done(); + }, + }); + const replacementOutputStdin = new Writable({ + write(_chunk, _encoding, done) { + done(); + }, + }); + const makeProcess = (stdio: { + stdin?: { write(chunk: unknown): unknown } | null; + stdout?: { on(event: "data", listener: (chunk: unknown) => void): unknown } | null; + }): TestBridgeProcess => { + const proc = new EventEmitter() as unknown as TestBridgeProcess; + proc.stdin = stdio.stdin; + proc.stdout = stdio.stdout; + proc.stderr = new PassThrough(); + proc.killed = false; + proc.kill = vi.fn(() => { + proc.killed = true; + return true; + }); + return proc; + }; + const outputProcess = makeProcess({ stdin: outputStdin, stdout: null }); + const inputProcess = makeProcess({ stdout: inputStdout, stdin: null }); + const bargeInProcess = makeProcess({ stdout: bargeInStdout, stdin: null }); + const replacementOutputProcess = makeProcess({ stdin: replacementOutputStdin, stdout: null }); + const spawnMock = vi + .fn() + .mockReturnValueOnce(outputProcess) + .mockReturnValueOnce(inputProcess) + .mockReturnValueOnce(bargeInProcess) + .mockReturnValueOnce(replacementOutputProcess); + + const handle = await startCommandRealtimeAudioBridge({ + config: resolveGoogleMeetConfig({ + chrome: { + bargeInInputCommand: ["capture-human"], + bargeInRmsThreshold: 10, + bargeInPeakThreshold: 10, + bargeInCooldownMs: 1, + }, + realtime: { 
provider: "openai", model: "gpt-realtime" }, + }), + fullConfig: {} as never, + runtime: {} as never, + meetingSessionId: "meet-1", + inputCommand: ["capture-meet"], + outputCommand: ["play-meet"], + logger: noopLogger, + providers: [provider], + spawn: spawnMock, + }); + + callbacks?.onAudio(Buffer.alloc(48_000)); + inputStdout.write(Buffer.from([1, 2, 3, 4])); + bargeInStdout.write(Buffer.from([0xff, 0x7f, 0xff, 0x7f])); + + expect(spawnMock).toHaveBeenNthCalledWith(3, "capture-human", [], { + stdio: ["ignore", "pipe", "pipe"], + }); + expect(bridge.handleBargeIn).toHaveBeenCalled(); + expect(outputProcess.kill).toHaveBeenCalledWith("SIGKILL"); + expect(sendAudio).not.toHaveBeenCalledWith(Buffer.from([1, 2, 3, 4])); + expect(handle.getHealth()).toMatchObject({ + clearCount: 1, + suppressedInputBytes: 4, + }); + + await handle.stop(); + expect(inputProcess.kill).toHaveBeenCalledWith("SIGTERM"); + expect(bargeInProcess.kill).toHaveBeenCalledWith("SIGTERM"); + expect(replacementOutputProcess.kill).toHaveBeenCalledWith("SIGTERM"); }); it("pipes paired-node command-pair audio through the realtime provider", async () => { diff --git a/extensions/google-meet/index.ts b/extensions/google-meet/index.ts index 717c320b99e..29a61185c74 100644 --- a/extensions/google-meet/index.ts +++ b/extensions/google-meet/index.ts @@ -91,6 +91,26 @@ const googleMeetConfigSchema = { help: "Command that reads assistant audio from stdin in chrome.audioFormat.", advanced: true, }, + "chrome.bargeInInputCommand": { + label: "Barge-In Input Command", + help: "Optional Gateway-hosted microphone command that writes signed 16-bit little-endian mono PCM for human interruption detection while assistant playback is active.", + advanced: true, + }, + "chrome.bargeInRmsThreshold": { + label: "Barge-In RMS Threshold", + help: "RMS level on chrome.bargeInInputCommand that counts as a human interruption.", + advanced: true, + }, + "chrome.bargeInPeakThreshold": { + label: "Barge-In Peak Threshold", + help: "Peak level on chrome.bargeInInputCommand that counts as a human interruption.", + advanced: true, + }, + "chrome.bargeInCooldownMs": { + label: "Barge-In Cooldown (ms)", + help: "Minimum delay between repeated barge-in clears.", + advanced: true, + }, "chrome.audioBridgeCommand": { label: "Audio Bridge Command", advanced: true }, "chrome.audioBridgeHealthCommand": { label: "Audio Bridge Health Command", diff --git a/extensions/google-meet/openclaw.plugin.json b/extensions/google-meet/openclaw.plugin.json index bf048fb811f..67fcf9a1a58 100644 --- a/extensions/google-meet/openclaw.plugin.json +++ b/extensions/google-meet/openclaw.plugin.json @@ -65,6 +65,26 @@ "help": "Command that reads assistant audio from stdin in chrome.audioFormat.", "advanced": true }, + "chrome.bargeInInputCommand": { + "label": "Barge-In Input Command", + "help": "Optional Gateway-hosted microphone command that writes signed 16-bit little-endian mono PCM for human interruption detection while assistant playback is active.", + "advanced": true + }, + "chrome.bargeInRmsThreshold": { + "label": "Barge-In RMS Threshold", + "help": "RMS level on chrome.bargeInInputCommand that counts as a human interruption.", + "advanced": true + }, + "chrome.bargeInPeakThreshold": { + "label": "Barge-In Peak Threshold", + "help": "Peak level on chrome.bargeInInputCommand that counts as a human interruption.", + "advanced": true + }, + "chrome.bargeInCooldownMs": { + "label": "Barge-In Cooldown (ms)", + "help": "Minimum delay between repeated barge-in clears.", + "advanced": 
true + }, "chrome.audioFormat": { "label": "Audio Format", "help": "Command-pair audio format. PCM16 24 kHz is the default Chrome/Meet path; G.711 mu-law 8 kHz remains available for legacy command pairs.", @@ -294,6 +314,24 @@ "type": "string" } }, + "bargeInInputCommand": { + "type": "array", + "items": { + "type": "string" + } + }, + "bargeInRmsThreshold": { + "type": "number", + "default": 650 + }, + "bargeInPeakThreshold": { + "type": "number", + "default": 2500 + }, + "bargeInCooldownMs": { + "type": "number", + "default": 900 + }, "audioBridgeCommand": { "type": "array", "items": { diff --git a/extensions/google-meet/src/config.ts b/extensions/google-meet/src/config.ts index f75bdf084d7..81579efc3ff 100644 --- a/extensions/google-meet/src/config.ts +++ b/extensions/google-meet/src/config.ts @@ -35,6 +35,10 @@ export type GoogleMeetConfig = { waitForInCallMs: number; audioInputCommand?: string[]; audioOutputCommand?: string[]; + bargeInInputCommand?: string[]; + bargeInRmsThreshold: number; + bargeInPeakThreshold: number; + bargeInCooldownMs: number; audioBridgeCommand?: string[]; audioBridgeHealthCommand?: string[]; }; @@ -152,6 +156,9 @@ export const LEGACY_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [ ] as const; export const DEFAULT_GOOGLE_MEET_CHROME_AUDIO_FORMAT: GoogleMeetChromeAudioFormat = "pcm16-24khz"; +export const DEFAULT_GOOGLE_MEET_BARGE_IN_RMS_THRESHOLD = 650; +export const DEFAULT_GOOGLE_MEET_BARGE_IN_PEAK_THRESHOLD = 2500; +export const DEFAULT_GOOGLE_MEET_BARGE_IN_COOLDOWN_MS = 900; export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`; export const DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE = "Say exactly: I'm here and listening."; @@ -175,6 +182,9 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = { waitForInCallMs: 20_000, audioInputCommand: [...DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND], audioOutputCommand: [...DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND], + bargeInRmsThreshold: DEFAULT_GOOGLE_MEET_BARGE_IN_RMS_THRESHOLD, + bargeInPeakThreshold: DEFAULT_GOOGLE_MEET_BARGE_IN_PEAK_THRESHOLD, + bargeInCooldownMs: DEFAULT_GOOGLE_MEET_BARGE_IN_COOLDOWN_MS, }, chromeNode: {}, twilio: {}, @@ -411,6 +421,19 @@ export function resolveGoogleMeetConfigWithEnv( audioOutputCommand: configuredAudioOutputCommand ?? 
[ ...defaultAudioOutputCommand(audioFormat), ], + bargeInInputCommand: resolveStringArray(chrome.bargeInInputCommand), + bargeInRmsThreshold: resolveNumber( + chrome.bargeInRmsThreshold, + DEFAULT_GOOGLE_MEET_CONFIG.chrome.bargeInRmsThreshold, + ), + bargeInPeakThreshold: resolveNumber( + chrome.bargeInPeakThreshold, + DEFAULT_GOOGLE_MEET_CONFIG.chrome.bargeInPeakThreshold, + ), + bargeInCooldownMs: resolveNumber( + chrome.bargeInCooldownMs, + DEFAULT_GOOGLE_MEET_CONFIG.chrome.bargeInCooldownMs, + ), audioBridgeCommand: resolveStringArray(chrome.audioBridgeCommand), audioBridgeHealthCommand: resolveStringArray(chrome.audioBridgeHealthCommand), }, diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index 4873dc849c8..f059528552e 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -63,6 +63,23 @@ function splitCommand(argv: string[]): { command: string; args: string[] } { return { command, args }; } +function readPcm16Stats(audio: Buffer): { rms: number; peak: number } { + let sumSquares = 0; + let peak = 0; + let samples = 0; + for (let offset = 0; offset + 1 < audio.byteLength; offset += 2) { + const sample = audio.readInt16LE(offset); + const abs = Math.abs(sample); + peak = Math.max(peak, abs); + sumSquares += sample * sample; + samples += 1; + } + return { + rms: samples > 0 ? Math.sqrt(sumSquares / samples) : 0, + peak, + }; +} + export function resolveGoogleMeetRealtimeAudioFormat(config: GoogleMeetConfig) { return config.chrome.audioFormat === "g711-ulaw-8khz" ? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ @@ -117,6 +134,48 @@ export async function startCommandRealtimeAudioBridge(params: { let lastOutputBytes = 0; let lastClearAt: string | undefined; let clearCount = 0; + let suppressedInputBytes = 0; + let lastSuppressedInputAt: string | undefined; + let suppressInputUntil = 0; + let lastOutputAtMs = 0; + let lastOutputPlayableUntilMs = 0; + let bargeInInputProcess: BridgeProcess | undefined; + + const suppressInputForOutput = (audio: Buffer) => { + const bytesPerMs = params.config.chrome.audioFormat === "g711-ulaw-8khz" ? 8 : 48; + const durationMs = Math.ceil(audio.byteLength / bytesPerMs); + const until = Date.now() + durationMs + 900; + suppressInputUntil = Math.max(suppressInputUntil, until); + lastOutputPlayableUntilMs = Math.max(lastOutputPlayableUntilMs, until); + }; + + const terminateProcess = (proc: BridgeProcess, signal: NodeJS.Signals = "SIGTERM") => { + if (proc.killed && signal !== "SIGKILL") { + return; + } + let exited = false; + proc.on("exit", () => { + exited = true; + }); + try { + proc.kill(signal); + } catch { + return; + } + if (signal === "SIGKILL") { + return; + } + const timer = setTimeout(() => { + if (!exited) { + try { + proc.kill("SIGKILL"); + } catch { + // Process may have exited after the grace check. 
+ } + } + }, 1000); + timer.unref?.(); + }; const stop = async () => { if (stopped) { @@ -130,8 +189,11 @@ export async function startCommandRealtimeAudioBridge(params: { `[google-meet] realtime voice bridge close ignored: ${formatErrorMessage(error)}`, ); } - inputProcess.kill("SIGTERM"); - outputProcess.kill("SIGTERM"); + terminateProcess(inputProcess); + terminateProcess(outputProcess); + if (bargeInInputProcess) { + terminateProcess(bargeInInputProcess); + } }; const fail = (label: string) => (error: Error) => { @@ -169,10 +231,69 @@ export async function startCommandRealtimeAudioBridge(params: { attachOutputProcessHandlers(outputProcess); clearCount += 1; lastClearAt = new Date().toISOString(); + suppressInputUntil = 0; + lastOutputPlayableUntilMs = 0; params.logger.debug?.( `[google-meet] cleared realtime audio output buffer by restarting playback command`, ); - previousOutput.kill("SIGTERM"); + terminateProcess(previousOutput, "SIGKILL"); + }; + const startHumanBargeInMonitor = () => { + const commandArgv = params.config.chrome.bargeInInputCommand; + if (!commandArgv) { + return; + } + const command = splitCommand(commandArgv); + let lastBargeInAt = 0; + bargeInInputProcess = spawnFn(command.command, command.args, { + stdio: ["ignore", "pipe", "pipe"], + }); + bargeInInputProcess.stdout?.on("data", (chunk) => { + if (stopped || lastOutputAtMs === 0) { + return; + } + const now = Date.now(); + const playbackActive = now <= Math.max(lastOutputPlayableUntilMs, suppressInputUntil); + if (!playbackActive && now - lastOutputAtMs > 1000) { + return; + } + if (now - lastBargeInAt < params.config.chrome.bargeInCooldownMs) { + return; + } + const audio = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk); + const stats = readPcm16Stats(audio); + if ( + stats.rms < params.config.chrome.bargeInRmsThreshold && + stats.peak < params.config.chrome.bargeInPeakThreshold + ) { + return; + } + lastBargeInAt = now; + suppressInputUntil = 0; + const beforeClearCount = clearCount; + bridge?.handleBargeIn({ audioPlaybackActive: true }); + if (beforeClearCount === clearCount) { + clearOutputPlayback(); + } + params.logger.debug?.( + `[google-meet] human barge-in detected by local input (rms=${Math.round( + stats.rms, + )}, peak=${stats.peak})`, + ); + }); + bargeInInputProcess.stderr?.on("data", (chunk) => { + params.logger.debug?.(`[google-meet] barge-in input: ${String(chunk).trim()}`); + }); + bargeInInputProcess.on("error", (error) => { + params.logger.warn(`[google-meet] human barge-in input failed: ${formatErrorMessage(error)}`); + }); + bargeInInputProcess.on("exit", (code, signal) => { + if (!stopped) { + params.logger.debug?.( + `[google-meet] human barge-in input exited (${code ?? signal ?? "done"})`, + ); + } + }); }; inputProcess.on("error", fail("audio input command")); inputProcess.on("exit", (code, signal) => { @@ -204,8 +325,10 @@ export async function startCommandRealtimeAudioBridge(params: { audioSink: { isOpen: () => !stopped, sendAudio: (audio) => { + lastOutputAtMs = Date.now(); lastOutputAt = new Date().toISOString(); lastOutputBytes += audio.byteLength; + suppressInputForOutput(audio); outputProcess.stdin?.write(audio); }, clearAudio: clearOutputPlayback, @@ -256,10 +379,16 @@ export async function startCommandRealtimeAudioBridge(params: { realtimeReady = true; }, }); + startHumanBargeInMonitor(); inputProcess.stdout?.on("data", (chunk) => { const audio = Buffer.isBuffer(chunk) ? 
chunk : Buffer.from(chunk); if (!stopped && audio.byteLength > 0) { + if (Date.now() < suppressInputUntil) { + lastSuppressedInputAt = new Date().toISOString(); + suppressedInputBytes += audio.byteLength; + return; + } lastInputAt = new Date().toISOString(); lastInputBytes += audio.byteLength; bridge?.sendAudio(Buffer.from(audio)); @@ -281,8 +410,10 @@ export async function startCommandRealtimeAudioBridge(params: { audioOutputActive: lastOutputBytes > 0, lastInputAt, lastOutputAt, + lastSuppressedInputAt, lastInputBytes, lastOutputBytes, + suppressedInputBytes, lastClearAt, clearCount, bridgeClosed: stopped, diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index 3282d4acfff..0ca69d601e4 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -133,7 +133,11 @@ function evaluateSpeechReadiness(session: GoogleMeetSession): { function collectChromeAudioCommands(config: GoogleMeetConfig): string[] { const commands = config.chrome.audioBridgeCommand ? [config.chrome.audioBridgeCommand[0]] - : [config.chrome.audioInputCommand?.[0], config.chrome.audioOutputCommand?.[0]]; + : [ + config.chrome.audioInputCommand?.[0], + config.chrome.audioOutputCommand?.[0], + config.chrome.bargeInInputCommand?.[0], + ]; return [...new Set(commands.filter((value): value is string => Boolean(value?.trim())))]; } diff --git a/extensions/google-meet/src/transports/types.ts b/extensions/google-meet/src/transports/types.ts index 9c523bf68bf..df2ea75bd5a 100644 --- a/extensions/google-meet/src/transports/types.ts +++ b/extensions/google-meet/src/transports/types.ts @@ -40,9 +40,11 @@ export type GoogleMeetChromeHealth = { audioOutputActive?: boolean; lastInputAt?: string; lastOutputAt?: string; + lastSuppressedInputAt?: string; lastClearAt?: string; lastInputBytes?: number; lastOutputBytes?: number; + suppressedInputBytes?: number; consecutiveInputErrors?: number; lastInputError?: string; clearCount?: number; diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts index 9991dd23ea4..c9e4e0cfb15 100644 --- a/extensions/openai/realtime-voice-provider.test.ts +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -66,6 +66,9 @@ type FakeWebSocketInstance = InstanceType<typeof FakeWebSocket>; type SentRealtimeEvent = { type: string; audio?: string; + item_id?: string; + content_index?: number; + audio_end_ms?: number; session?: { input_audio_format?: string; output_audio_format?: string; @@ -279,4 +282,56 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(socket.terminated).toBe(false); expect(onClose).toHaveBeenCalledWith("completed"); }); + + it("truncates externally interrupted playback after an immediate mark acknowledgement", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const onAudio = vi.fn(); + const onClearAudio = vi.fn(); + let bridge: ReturnType<typeof provider.createBridge>; + bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + onAudio, + onClearAudio, + onMark: () => bridge.acknowledgeMark(), + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + await connecting; + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + + bridge.setMediaTimestamp(1000); + socket.emit( + "message", + Buffer.from(JSON.stringify({ type:
"response.created", response: { id: "resp_1" } })), + ); + socket.emit( + "message", + Buffer.from( + JSON.stringify({ + type: "response.audio.delta", + item_id: "item_1", + delta: Buffer.from("assistant audio").toString("base64"), + }), + ), + ); + bridge.setMediaTimestamp(1240); + + bridge.handleBargeIn?.({ audioPlaybackActive: true }); + + expect(onAudio).toHaveBeenCalledTimes(1); + expect(onClearAudio).toHaveBeenCalledTimes(1); + expect(parseSent(socket)).toContainEqual({ type: "response.cancel" }); + expect(parseSent(socket)).toContainEqual({ + type: "conversation.item.truncate", + item_id: "item_1", + content_index: 0, + audio_end_ms: 240, + }); + }); }); diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index 0a7732907c6..a7879ba25c0 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -10,6 +10,7 @@ import { } from "openclaw/plugin-sdk/proxy-capture"; import type { RealtimeVoiceAudioFormat, + RealtimeVoiceBargeInOptions, RealtimeVoiceBridge, RealtimeVoiceBrowserSession, RealtimeVoiceBrowserSessionCreateRequest, @@ -77,6 +78,10 @@ type RealtimeEvent = { item_id?: string; call_id?: string; name?: string; + response?: { + id?: string; + status?: string; + }; error?: unknown; }; @@ -141,6 +146,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { private pendingAudio: Buffer[] = []; private markQueue: string[] = []; private responseStartTimestamp: number | null = null; + private responseActive = false; private latestMediaTimestamp = 0; private lastAssistantItemId: string | null = null; private toolCallBuffers = new Map(); @@ -216,10 +222,6 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { return; } this.markQueue.shift(); - if (this.markQueue.length === 0) { - this.responseStartTimestamp = null; - this.lastAssistantItemId = null; - } } close(): void { @@ -483,18 +485,23 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } return; + case "response.created": + this.responseActive = true; + return; + case "response.audio.delta": { if (!event.delta) { return; } const audio = base64ToBuffer(event.delta); this.config.onAudio(audio); - if (this.responseStartTimestamp === null) { + if (event.item_id && event.item_id !== this.lastAssistantItemId) { + this.lastAssistantItemId = event.item_id; + this.responseStartTimestamp = this.latestMediaTimestamp; + } else if (this.responseStartTimestamp === null) { this.responseStartTimestamp = this.latestMediaTimestamp; } - if (event.item_id) { - this.lastAssistantItemId = event.item_id; - } + this.responseActive = true; this.sendMark(); return; } @@ -527,6 +534,10 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } return; + case "response.done": + this.responseActive = false; + return; + case "response.function_call_arguments.delta": { const key = event.item_id ?? 
"unknown"; const existing = this.toolCallBuffers.get(key); @@ -576,21 +587,29 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } } - private handleBargeIn(): void { - if (this.markQueue.length > 0 && this.responseStartTimestamp !== null) { - const elapsedMs = this.latestMediaTimestamp - this.responseStartTimestamp; - if (this.lastAssistantItemId) { - this.sendEvent({ - type: "conversation.item.truncate", - item_id: this.lastAssistantItemId, - content_index: 0, - audio_end_ms: Math.max(0, elapsedMs), - }); - } + handleBargeIn(options?: RealtimeVoiceBargeInOptions): void { + const assistantItemId = this.lastAssistantItemId; + const responseStartTimestamp = this.responseStartTimestamp; + const shouldInterruptProvider = + responseStartTimestamp !== null && + assistantItemId !== null && + (this.markQueue.length > 0 || options?.audioPlaybackActive === true); + if (options?.audioPlaybackActive === true && this.responseActive) { + this.sendEvent({ type: "response.cancel" }); + } + if (shouldInterruptProvider) { + const elapsedMs = this.latestMediaTimestamp - responseStartTimestamp; + this.sendEvent({ + type: "conversation.item.truncate", + item_id: assistantItemId, + content_index: 0, + audio_end_ms: Math.max(0, elapsedMs), + }); this.config.onClearAudio(); this.markQueue = []; this.lastAssistantItemId = null; this.responseStartTimestamp = null; + this.responseActive = false; return; } this.config.onClearAudio(); diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts index 9966c347784..ceb63d0283c 100644 --- a/src/plugin-sdk/realtime-voice.ts +++ b/src/plugin-sdk/realtime-voice.ts @@ -1,6 +1,7 @@ export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; export type { RealtimeVoiceAudioFormat, + RealtimeVoiceBargeInOptions, RealtimeVoiceBridge, RealtimeVoiceBridgeCallbacks, RealtimeVoiceBrowserSession, diff --git a/src/realtime-voice/provider-types.ts b/src/realtime-voice/provider-types.ts index 57e85d68c46..51fd6a1b20c 100644 --- a/src/realtime-voice/provider-types.ts +++ b/src/realtime-voice/provider-types.ts @@ -154,8 +154,17 @@ export type RealtimeVoiceBridge = { setMediaTimestamp(ts: number): void; sendUserMessage?(text: string): void; triggerGreeting?(instructions?: string): void; + handleBargeIn?(options?: RealtimeVoiceBargeInOptions): void; submitToolResult(callId: string, result: unknown, options?: RealtimeVoiceToolResultOptions): void; acknowledgeMark(): void; close(): void; isConnected(): boolean; }; + +export type RealtimeVoiceBargeInOptions = { + /** + * The caller has already confirmed assistant audio is still playing in its output sink. + * This lets providers interrupt output even when the sink cannot provide real playback marks. 
+ */ + audioPlaybackActive?: boolean; +}; diff --git a/src/realtime-voice/session-runtime.ts b/src/realtime-voice/session-runtime.ts index 1f188d30996..44535f5cd82 100644 --- a/src/realtime-voice/session-runtime.ts +++ b/src/realtime-voice/session-runtime.ts @@ -2,6 +2,7 @@ import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; import type { RealtimeVoiceBridge, RealtimeVoiceAudioFormat, + RealtimeVoiceBargeInOptions, RealtimeVoiceCloseReason, RealtimeVoiceProviderConfig, RealtimeVoiceRole, @@ -26,6 +27,7 @@ export type RealtimeVoiceBridgeSession = { connect(): Promise; sendAudio(audio: Buffer): void; sendUserMessage(text: string): void; + handleBargeIn(options?: RealtimeVoiceBargeInOptions): void; setMediaTimestamp(ts: number): void; submitToolResult(callId: string, result: unknown, options?: RealtimeVoiceToolResultOptions): void; triggerGreeting(instructions?: string): void; @@ -67,6 +69,7 @@ export function createRealtimeVoiceBridgeSession( connect: () => requireBridge().connect(), sendAudio: (audio) => requireBridge().sendAudio(audio), sendUserMessage: (text) => requireBridge().sendUserMessage?.(text), + handleBargeIn: (options) => requireBridge().handleBargeIn?.(options), setMediaTimestamp: (ts) => requireBridge().setMediaTimestamp(ts), submitToolResult: (callId, result, options) => requireBridge().submitToolResult(callId, result, options),
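Taken together, a transport consumes this through the session wrapper above. A minimal sketch of the calling side follows; `onHumanInterruption` and `clearOutputPlayback` are hypothetical names, and the google-meet bridge in this diff additionally guards against double-clearing by comparing `clearCount` before and after the call.

```typescript
import type { RealtimeVoiceBridgeSession } from "./session-runtime.js";

// Hypothetical transport hook: a local monitor decided the human is talking
// over active assistant playback.
function onHumanInterruption(
  session: RealtimeVoiceBridgeSession,
  clearOutputPlayback: () => void,
): void {
  // audioPlaybackActive tells the provider the transport has confirmed audio
  // is still playing in its sink, so the provider may truncate the response
  // even without real playback marks.
  session.handleBargeIn({ audioPlaybackActive: true });
  // handleBargeIn is optional on bridges; restarting local playback ensures
  // stale assistant audio stops even when the provider cannot interrupt.
  clearOutputPlayback();
}
```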