From b13e9f186491ab8954a62cfd098c370f5444aa8a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 4 May 2026 01:23:24 +0100 Subject: [PATCH] fix: stabilize Google Meet realtime talkback --- CHANGELOG.md | 1 + docs/plugins/google-meet.md | 41 ++++- extensions/google-meet/index.test.ts | 148 ++++++++++++++++-- extensions/google-meet/index.ts | 4 + extensions/google-meet/openclaw.plugin.json | 11 +- extensions/google-meet/src/agent-consult.ts | 4 +- extensions/google-meet/src/cli.ts | 8 + extensions/google-meet/src/config.ts | 17 +- extensions/google-meet/src/node-host.ts | 12 +- extensions/google-meet/src/realtime-node.ts | 107 ++++++++++++- extensions/google-meet/src/realtime.ts | 132 +++++++++++++++- extensions/google-meet/src/runtime.ts | 1 + .../google-meet/src/transports/chrome.ts | 63 +++++++- .../google-meet/src/transports/types.ts | 4 + .../openai/realtime-voice-provider.test.ts | 83 +++++++++- extensions/openai/realtime-voice-provider.ts | 13 +- src/realtime-voice/agent-consult-runtime.ts | 2 + src/realtime-voice/provider-types.ts | 1 + src/realtime-voice/session-runtime.test.ts | 22 +++ src/realtime-voice/session-runtime.ts | 2 + 20 files changed, 633 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dc2b4dff71..5c5cd5ec076 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ Docs: https://docs.openclaw.ai - QA/cache: require the full `CACHE-OK ` marker before live cache probes stop retrying, so suffix-only prose cannot hide a broken probe response. Thanks @vincentkoc. - Slack/Matrix: avoid creating blank progress-draft messages when `streaming.progress.label=false` and progress tool lines are disabled. Thanks @vincentkoc. - QA/Matrix: keep the mock OpenAI tool-progress provider aligned with exact-marker Matrix prompts so the hardened live preview scenario still forces a deterministic read before final delivery. Thanks @vincentkoc. 
+- Google Meet: make realtime talk-back agent-driven by default with `realtime.strategy: "agent"`, keep the previous direct bidirectional model behavior available as `realtime.strategy: "bidi"`, route the Meet tab speaker output to `BlackHole 2ch` automatically for local Chrome realtime joins, coalesce nearby speech transcript fragments before consulting the agent, and avoid cutting off agent speech from server VAD or stale playback pipe errors. - OpenAI/Google Meet: wait for realtime voice `session.updated` before treating the bridge as connected, so Meet joins do not return with audio queued behind an unconfigured realtime session. Thanks @vincentkoc. - Plugins/catalog: merge official external catalog descriptors into partial package channel config metadata, so lagging WeCom/Yuanbao manifests keep their own schema while still exposing host-supplied labels and setup text. Thanks @vincentkoc. - Plugins/catalog: supplement lagging official external WeCom and Yuanbao npm manifests with channel config descriptors and declared tool contracts from the OpenClaw catalog, so trusted package sweeps no longer fail because external package metadata trails the host contract. Thanks @vincentkoc. diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index 189edde6824..4433f1f0ef5 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -190,7 +190,7 @@ then share the returned `meetingUri`. ``` For an observe-only/browser-control join, set `"mode": "transcribe"`. That does -not start the duplex realtime model bridge, does not require BlackHole or SoX, +not start the duplex realtime voice bridge, does not require BlackHole or SoX, and will not talk back into the meeting. Chrome joins in this mode also avoid OpenClaw's microphone/camera permission grant and avoid the Meet **Use microphone** path. 
If Meet shows an audio-choice interstitial, automation tries @@ -1027,6 +1027,12 @@ Defaults: interruption on `chrome.bargeInInputCommand` - `chrome.bargeInCooldownMs: 900`: minimum delay between repeated human interruption clears +- `realtime.strategy: "agent"`: default. Participant speech is transcribed, + sent to the configured OpenClaw agent in a per-meeting sub-agent session, and + the returned answer is spoken back through the realtime provider. +- `realtime.strategy: "bidi"`: direct bidirectional realtime model mode. The + realtime provider answers participant speech directly and may call + `openclaw_agent_consult` for deeper/tool-backed answers. - `realtime.provider: "openai"` - `realtime.toolPolicy: "safe-read-only"` - `realtime.instructions`: brief spoken replies, with @@ -1072,6 +1078,7 @@ Optional overrides: node: "parallels-macos", }, realtime: { + strategy: "agent", provider: "google", agentId: "jay", toolPolicy: "owner", @@ -1124,7 +1131,10 @@ Agents can use the `google_meet` tool: Use `transport: "chrome"` when Chrome runs on the Gateway host. Use `transport: "chrome-node"` when Chrome runs on a paired node such as a Parallels VM. In both cases the realtime model and `openclaw_agent_consult` run on the -Gateway host, so model credentials stay there. +Gateway host, so model credentials stay there. With the default +`realtime.strategy: "agent"`, the realtime provider handles audio and +transcription while the configured OpenClaw agent produces the spoken answer. +With `realtime.strategy: "bidi"`, the realtime model answers directly. Use `action: "status"` to list active sessions or inspect a session ID. Use `action: "speak"` with `sessionId` and `message` to make the realtime agent @@ -1149,6 +1159,8 @@ a session ended. not send the intro/test phrase into the audio bridge. 
- `providerConnected` / `realtimeReady`: realtime voice bridge state - `lastInputAt` / `lastOutputAt`: last audio seen from or sent to the bridge +- `audioOutputRouted` / `audioOutputDeviceLabel`: whether the Meet tab's media + output was actively routed to the BlackHole device used by the bridge - `lastSuppressedInputAt` / `suppressedInputBytes`: loopback input ignored while assistant playback is active @@ -1164,8 +1176,20 @@ a session ended. Chrome realtime mode is optimized for a live voice loop. The realtime voice provider hears the meeting audio and speaks through the configured audio bridge. -When the realtime model needs deeper reasoning, current information, or normal -OpenClaw tools, it can call `openclaw_agent_consult`. +The default `realtime.strategy: "agent"` uses the realtime provider for audio +I/O and transcription, but routes final participant transcripts through the +configured OpenClaw agent before speaking. Set `realtime.strategy: "bidi"` when +you want the realtime model to answer directly. +Nearby final transcript fragments are coalesced before the consult so one spoken +turn does not produce several stale partial answers. + +| Strategy | Who decides the answer | Context behavior | Use when | +| -------- | ----------------------------- | ------------------------------------------------------------------------------------ | ----------------------------------------------------- | +| `agent` | The configured OpenClaw agent | Per-meeting sub-agent session plus normal agent policy, tools, workspace, and memory | You want "my agent is in the meeting" behavior | +| `bidi` | The realtime voice model | Realtime session context, with optional `openclaw_agent_consult` calls | You want the lowest-latency conversational voice loop | + +In `bidi` strategy, when the realtime model needs deeper reasoning, current +information, or normal OpenClaw tools, it can call `openclaw_agent_consult`. 
The consult tool runs the regular OpenClaw agent behind the scenes with recent meeting transcript context and returns a concise spoken answer to the realtime @@ -1176,6 +1200,10 @@ By default, consults run against the `main` agent. Set `realtime.agentId` when a Meet lane should consult a dedicated OpenClaw agent workspace, model defaults, tool policy, memory, and session history. +Agent strategy consults use a per-meeting `agent:<agentId>:subagent:google-meet:<meetingSessionId>` +session key so follow-up questions keep meeting context while inheriting normal +agent policy from the configured agent. + `realtime.toolPolicy` controls the consult run: - `safe-read-only`: expose the consult tool and limit the regular agent to @@ -1414,7 +1442,8 @@ Also verify: - `BlackHole 2ch` is visible on the Chrome host. - `sox` exists on the Chrome host. - Meet microphone and speaker are routed through the virtual audio path used by - OpenClaw. + OpenClaw. `doctor` should show `meet output routed: yes` for local Chrome + realtime joins. `googlemeet doctor [session-id]` prints the session, node, in-call state, manual action reason, realtime provider connection, `realtimeReady`, audio @@ -1578,7 +1607,7 @@ phone dial-in participation. Chrome realtime mode needs `BlackHole 2ch` plus either: - `chrome.audioInputCommand` plus `chrome.audioOutputCommand`: OpenClaw owns the - realtime model bridge and pipes audio in `chrome.audioFormat` between those + realtime voice bridge and pipes audio in `chrome.audioFormat` between those commands and the selected realtime voice provider. The default Chrome path is 24 kHz PCM16; 8 kHz G.711 mu-law remains available for legacy command pairs. 
- `chrome.audioBridgeCommand`: an external bridge command owns the whole local diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index 48aa286180e..60c0810e5b9 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -371,6 +371,7 @@ describe("google-meet plugin", () => { postDtmfSpeechDelayMs: 5000, }, realtime: { + strategy: "agent", provider: "openai", introMessage: "Say exactly: I'm here and listening.", toolPolicy: "safe-read-only", @@ -2253,7 +2254,7 @@ describe("google-meet plugin", () => { ); }); - it("retries caption enable until the captions button is available", () => { + it("retries caption enable until the captions button is available", async () => { const makeButton = (label: string) => ({ disabled: false, innerText: "", @@ -2302,23 +2303,23 @@ describe("google-meet plugin", () => { captureCaptions: true, guestName: "OpenClaw Agent", })})`, - ).runInContext(context) as () => string; + ).runInContext(context) as () => string | Promise; - const first = JSON.parse(inspect()) as { captionsEnabledAttempted?: boolean }; + const first = JSON.parse(await inspect()) as { captionsEnabledAttempted?: boolean }; const stateAfterFirst = windowState.__openclawMeetCaptions as { enabledAttempted?: boolean }; expect(first.captionsEnabledAttempted).toBe(false); expect(stateAfterFirst.enabledAttempted).toBe(false); expect(captionButton.click).not.toHaveBeenCalled(); page.buttons = [leaveButton, captionButton]; - const second = JSON.parse(inspect()) as { captionsEnabledAttempted?: boolean }; + const second = JSON.parse(await inspect()) as { captionsEnabledAttempted?: boolean }; const stateAfterSecond = windowState.__openclawMeetCaptions as { enabledAttempted?: boolean }; expect(second.captionsEnabledAttempted).toBe(true); expect(stateAfterSecond.enabledAttempted).toBe(true); expect(captionButton.click).toHaveBeenCalledTimes(1); }); - it("reports in-call Meet audio permission problems from button 
labels", () => { + it("reports in-call Meet audio permission problems from button labels", async () => { const makeButton = (label: string) => ({ disabled: false, innerText: "", @@ -2361,9 +2362,9 @@ describe("google-meet plugin", () => { captureCaptions: false, guestName: "OpenClaw Agent", })})`, - ).runInContext(context) as () => string; + ).runInContext(context) as () => string | Promise; - const result = JSON.parse(inspect()) as { + const result = JSON.parse(await inspect()) as { inCall?: boolean; manualActionRequired?: boolean; manualActionReason?: string; @@ -2376,7 +2377,7 @@ describe("google-meet plugin", () => { expect(result.manualActionMessage).toContain("Allow microphone/camera/speaker permissions"); }); - it("uses the local Meet microphone control instead of remote participant mute buttons", () => { + it("uses the local Meet microphone control instead of remote participant mute buttons", async () => { const makeButton = (label: string, disabled = false) => ({ disabled, innerText: "", @@ -2416,9 +2417,9 @@ describe("google-meet plugin", () => { captureCaptions: false, guestName: "OpenClaw Agent", })})`, - ).runInContext(context) as () => string; + ).runInContext(context) as () => string | Promise; - const result = JSON.parse(inspect()) as { micMuted?: boolean; notes?: string[] }; + const result = JSON.parse(await inspect()) as { micMuted?: boolean; notes?: string[] }; expect(result.micMuted).toBe(true); expect(localMic.click).toHaveBeenCalledTimes(1); @@ -3526,7 +3527,7 @@ describe("google-meet plugin", () => { const handle = await startCommandRealtimeAudioBridge({ config: resolveGoogleMeetConfig({ - realtime: { provider: "openai", model: "gpt-realtime", agentId: "jay" }, + realtime: { strategy: "bidi", provider: "openai", model: "gpt-realtime", agentId: "jay" }, }), fullConfig: {} as never, runtime: runtime as never, @@ -3579,6 +3580,7 @@ describe("google-meet plugin", () => { expect(outputProcess.kill).toHaveBeenCalledWith("SIGKILL"); 
expect(replacementOutputStdinWrites).toEqual([Buffer.from([6, 7])]); outputProcess.emit("error", new Error("stale output process failed after clear")); + outputStdin.emit("error", new Error("stale output pipe closed after clear")); expect(bridge.close).not.toHaveBeenCalled(); expect(bridge.acknowledgeMark).toHaveBeenCalled(); expect(bridge.triggerGreeting).not.toHaveBeenCalled(); @@ -3616,6 +3618,7 @@ describe("google-meet plugin", () => { sampleRateHz: 24000, channels: 1, }, + autoRespondToAudio: true, tools: [ expect.objectContaining({ name: "openclaw_agent_consult", @@ -3635,13 +3638,14 @@ describe("google-meet plugin", () => { expect.objectContaining({ messageProvider: "google-meet", agentId: "jay", - sessionKey: "agent:jay:google-meet:meet-1", - sandboxSessionKey: "agent:jay:google-meet:meet-1", + spawnedBy: "agent:jay:main", + sessionKey: "agent:jay:subagent:google-meet:meet-1", + sandboxSessionKey: "agent:jay:subagent:google-meet:meet-1", thinkLevel: "high", toolsAllow: ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"], }), ); - expect(sessionStore).toHaveProperty("agent:jay:google-meet:meet-1"); + expect(sessionStore).toHaveProperty("agent:jay:subagent:google-meet:meet-1"); await handle.stop(); expect(bridge.close).toHaveBeenCalled(); @@ -3649,6 +3653,119 @@ describe("google-meet plugin", () => { expect(replacementOutputProcess.kill).toHaveBeenCalledWith("SIGTERM"); }); + it("defaults Chrome command-pair realtime to agent-driven talk-back", async () => { + let callbacks: Parameters[0] | undefined; + const sendUserMessage = vi.fn(); + const bridge = { + connect: vi.fn(async () => {}), + sendAudio: vi.fn(), + sendUserMessage, + setMediaTimestamp: vi.fn(), + submitToolResult: vi.fn(), + acknowledgeMark: vi.fn(), + close: vi.fn(), + triggerGreeting: vi.fn(), + isConnected: vi.fn(() => true), + }; + const provider: RealtimeVoiceProviderPlugin = { + id: "openai", + label: "OpenAI", + autoSelectOrder: 1, + resolveConfig: ({ rawConfig 
}) => rawConfig, + isConfigured: () => true, + createBridge: (req) => { + callbacks = req; + return bridge; + }, + }; + const inputStdout = new PassThrough(); + const makeProcess = (stdio: { + stdin?: { write(chunk: unknown): unknown } | null; + stdout?: { on(event: "data", listener: (chunk: unknown) => void): unknown } | null; + }): TestBridgeProcess => { + const proc = new EventEmitter() as unknown as TestBridgeProcess; + proc.stdin = stdio.stdin; + proc.stdout = stdio.stdout; + proc.stderr = new PassThrough(); + proc.killed = false; + proc.kill = vi.fn(() => { + proc.killed = true; + return true; + }); + return proc; + }; + const outputProcess = makeProcess({ + stdin: new Writable({ + write(_chunk, _encoding, done) { + done(); + }, + }), + stdout: null, + }); + const inputProcess = makeProcess({ stdout: inputStdout, stdin: null }); + const spawnMock = vi.fn().mockReturnValueOnce(outputProcess).mockReturnValueOnce(inputProcess); + const sessionStore: Record = {}; + const runtime = { + agent: { + resolveAgentDir: vi.fn(() => "/tmp/agent"), + resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"), + ensureAgentWorkspace: vi.fn(async () => {}), + session: { + resolveStorePath: vi.fn(() => "/tmp/sessions.json"), + loadSessionStore: vi.fn(() => sessionStore), + saveSessionStore: vi.fn(async () => {}), + updateSessionStore: vi.fn(async (_storePath, mutator) => mutator(sessionStore as never)), + resolveSessionFilePath: vi.fn(() => "/tmp/session.json"), + }, + runEmbeddedPiAgent: vi.fn(async () => ({ + payloads: [{ text: "The launch is still on track." 
}], + meta: {}, + })), + resolveAgentTimeoutMs: vi.fn(() => 1000), + }, + }; + + const handle = await startCommandRealtimeAudioBridge({ + config: resolveGoogleMeetConfig({ realtime: { provider: "openai", agentId: "jay" } }), + fullConfig: {} as never, + runtime: runtime as never, + meetingSessionId: "meet-1", + inputCommand: ["capture-meet"], + outputCommand: ["play-meet"], + logger: noopLogger, + providers: [provider], + spawn: spawnMock, + }); + + expect(callbacks).toMatchObject({ + autoRespondToAudio: false, + tools: [], + }); + callbacks?.onTranscript?.("user", "Are we still on track?", true); + callbacks?.onTranscript?.("user", "Please include launch blockers.", true); + + await vi.waitFor(() => { + expect(runtime.agent.runEmbeddedPiAgent).toHaveBeenCalledTimes(1); + expect(runtime.agent.runEmbeddedPiAgent).toHaveBeenCalledWith( + expect.objectContaining({ + agentId: "jay", + spawnedBy: "agent:jay:main", + sessionKey: "agent:jay:subagent:google-meet:meet-1", + sandboxSessionKey: "agent:jay:subagent:google-meet:meet-1", + }), + ); + }); + expect(JSON.stringify(runtime.agent.runEmbeddedPiAgent.mock.calls[0]?.[0])).toContain( + "Are we still on track?\\nPlease include launch blockers.", + ); + expect(sendUserMessage).toHaveBeenCalledWith( + expect.stringContaining(JSON.stringify("The launch is still on track.")), + ); + expect(sessionStore).toHaveProperty("agent:jay:subagent:google-meet:meet-1"); + + await handle.stop(); + }); + it("uses a local barge-in input command to clear active Chrome playback", async () => { let callbacks: | { @@ -3818,7 +3935,7 @@ describe("google-meet plugin", () => { const handle = await startNodeRealtimeAudioBridge({ config: resolveGoogleMeetConfig({ - realtime: { provider: "openai", model: "gpt-realtime" }, + realtime: { strategy: "bidi", provider: "openai", model: "gpt-realtime" }, }), fullConfig: {} as never, runtime: runtime as never, @@ -3901,6 +4018,7 @@ describe("google-meet plugin", () => { sampleRateHz: 24000, channels: 1, }, 
+ autoRespondToAudio: true, tools: [ expect.objectContaining({ name: "openclaw_agent_consult", diff --git a/extensions/google-meet/index.ts b/extensions/google-meet/index.ts index 69ba21eed06..1b1c9a8b9a5 100644 --- a/extensions/google-meet/index.ts +++ b/extensions/google-meet/index.ts @@ -150,6 +150,10 @@ const googleMeetConfigSchema = { advanced: true, }, "voiceCall.introMessage": { label: "Voice Call Intro Message", advanced: true }, + "realtime.strategy": { + label: "Realtime Strategy", + help: "Agent routes participant speech through OpenClaw before speaking; bidi lets the realtime model answer directly.", + }, "realtime.provider": { label: "Realtime Provider", help: "Defaults to OpenAI; uses OPENAI_API_KEY when no provider config is set.", diff --git a/extensions/google-meet/openclaw.plugin.json b/extensions/google-meet/openclaw.plugin.json index 755ca2b8d4f..e8099751421 100644 --- a/extensions/google-meet/openclaw.plugin.json +++ b/extensions/google-meet/openclaw.plugin.json @@ -143,6 +143,10 @@ "label": "Voice Call Intro Message", "advanced": true }, + "realtime.strategy": { + "label": "Realtime Strategy", + "help": "Agent routes participant speech through OpenClaw before speaking; bidi lets the realtime model answer directly." + }, "realtime.provider": { "label": "Realtime Provider", "help": "Defaults to OpenAI; uses OPENAI_API_KEY when no provider config is set." @@ -404,6 +408,11 @@ "type": "object", "additionalProperties": false, "properties": { + "strategy": { + "type": "string", + "enum": ["agent", "bidi"], + "default": "agent" + }, "provider": { "type": "string", "default": "openai" @@ -413,7 +422,7 @@ }, "instructions": { "type": "string", - "default": "You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering." + "default": "You are joining a private Google Meet as an OpenClaw voice transport. 
Keep spoken replies brief and natural. In agent strategy, wait for OpenClaw consult results and speak them exactly. In bidi strategy, answer directly and call openclaw_agent_consult for deeper reasoning, current information, or tools." }, "introMessage": { "type": "string", diff --git a/extensions/google-meet/src/agent-consult.ts b/extensions/google-meet/src/agent-consult.ts index 3d5894a29f5..3727dc84dc4 100644 --- a/extensions/google-meet/src/agent-consult.ts +++ b/extensions/google-meet/src/agent-consult.ts @@ -48,7 +48,8 @@ export async function consultOpenClawAgentForGoogleMeet(params: { transcript: Array<{ role: "user" | "assistant"; text: string }>; }): Promise<{ text: string }> { const agentId = normalizeAgentId(params.config.realtime.agentId); - const sessionKey = `agent:${agentId}:google-meet:${params.meetingSessionId}`; + const requesterSessionKey = `agent:${agentId}:main`; + const sessionKey = `agent:${agentId}:subagent:google-meet:${params.meetingSessionId}`; return await consultRealtimeVoiceAgent({ cfg: params.fullConfig, agentRuntime: params.runtime.agent, @@ -58,6 +59,7 @@ export async function consultOpenClawAgentForGoogleMeet(params: { messageProvider: "google-meet", lane: "google-meet", runIdPrefix: `google-meet:${params.meetingSessionId}`, + spawnedBy: requesterSessionKey, args: params.args, transcript: params.transcript, surface: "a private Google Meet", diff --git a/extensions/google-meet/src/cli.ts b/extensions/google-meet/src/cli.ts index 59c9d88f11b..edb78134f2e 100644 --- a/extensions/google-meet/src/cli.ts +++ b/extensions/google-meet/src/cli.ts @@ -353,6 +353,9 @@ function writeDoctorStatus(status: Awaited { + if (session.output === outputProcess) { + stopSession(session); + } + }); } function startOutputProcess(command: { command: string; args: string[] }) { @@ -241,7 +246,12 @@ function pushAudio(params: Record) { const audio = Buffer.from(base64, "base64"); session.lastOutputAt = new Date().toISOString(); session.lastOutputBytes += 
audio.byteLength; - session.output?.stdin?.write(audio); + try { + session.output?.stdin?.write(audio); + } catch { + stopSession(session); + throw new Error(`bridge is not open: ${bridgeId}`); + } return { bridgeId, ok: true }; } diff --git a/extensions/google-meet/src/realtime-node.ts b/extensions/google-meet/src/realtime-node.ts index ce86fe6d614..4fc1dfd37ec 100644 --- a/extensions/google-meet/src/realtime-node.ts +++ b/extensions/google-meet/src/realtime-node.ts @@ -15,6 +15,8 @@ import { import type { GoogleMeetConfig } from "./config.js"; import { getGoogleMeetRealtimeTranscriptHealth, + buildGoogleMeetSpeakExactUserMessage, + GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS, getGoogleMeetRealtimeEventHealth, recordGoogleMeetRealtimeTranscript, recordGoogleMeetRealtimeEvent, @@ -73,12 +75,95 @@ export async function startNodeRealtimeAudioBridge(params: { }); const transcript: GoogleMeetRealtimeTranscriptEntry[] = []; const realtimeEvents: GoogleMeetRealtimeEventEntry[] = []; + const strategy = params.config.realtime.strategy; + let agentConsultActive = false; + let pendingAgentQuestion: string | undefined; + let agentConsultDebounceTimer: ReturnType | undefined; + const enqueueAgentConsultForUserTranscript = (question: string): void => { + const trimmed = question.trim(); + if (!trimmed || stopped) { + return; + } + pendingAgentQuestion = pendingAgentQuestion ? 
`${pendingAgentQuestion}\n${trimmed}` : trimmed; + if (agentConsultDebounceTimer) { + clearTimeout(agentConsultDebounceTimer); + } + agentConsultDebounceTimer = setTimeout(() => { + agentConsultDebounceTimer = undefined; + const queuedQuestion = pendingAgentQuestion; + pendingAgentQuestion = undefined; + if (queuedQuestion && !stopped) { + void runAgentConsultForUserTranscript(queuedQuestion); + } + }, GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS); + agentConsultDebounceTimer.unref?.(); + }; + const runAgentConsultForUserTranscript = async (question: string): Promise => { + const trimmed = question.trim(); + if (!trimmed || stopped) { + return; + } + if (agentConsultActive) { + pendingAgentQuestion = trimmed; + return; + } + agentConsultActive = true; + let nextQuestion: string | undefined = trimmed; + try { + while (nextQuestion) { + if (stopped) { + return; + } + const currentQuestion = nextQuestion; + pendingAgentQuestion = undefined; + params.logger.info(`[google-meet] node realtime agent consult: ${currentQuestion}`); + const result = await consultOpenClawAgentForGoogleMeet({ + config: params.config, + fullConfig: params.fullConfig, + runtime: params.runtime, + logger: params.logger, + meetingSessionId: params.meetingSessionId, + args: { + question: currentQuestion, + responseStyle: "Brief, natural spoken answer for a live meeting.", + }, + transcript, + }); + if (!stopped && result.text.trim()) { + bridge?.sendUserMessage(buildGoogleMeetSpeakExactUserMessage(result.text.trim())); + } + nextQuestion = pendingAgentQuestion; + } + } catch (error) { + params.logger.warn( + `[google-meet] node realtime agent consult failed: ${formatErrorMessage(error)}`, + ); + if (!stopped) { + bridge?.sendUserMessage( + buildGoogleMeetSpeakExactUserMessage( + "I hit an error while checking that. 
Please try again.", + ), + ); + } + } finally { + agentConsultActive = false; + const queuedQuestion = pendingAgentQuestion; + pendingAgentQuestion = undefined; + if (queuedQuestion && !stopped) { + void runAgentConsultForUserTranscript(queuedQuestion); + } + } + }; const stop = async () => { if (stopped) { return; } stopped = true; + if (agentConsultDebounceTimer) { + clearTimeout(agentConsultDebounceTimer); + agentConsultDebounceTimer = undefined; + } try { bridge?.close(); } catch (error) { @@ -106,9 +191,11 @@ export async function startNodeRealtimeAudioBridge(params: { audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config), instructions: params.config.realtime.instructions, initialGreetingInstructions: params.config.realtime.introMessage, + autoRespondToAudio: strategy === "bidi", triggerGreetingOnReady: false, markStrategy: "ack-immediately", - tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy), + tools: + strategy === "bidi" ? resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy) : [], audioSink: { isOpen: () => !stopped, sendAudio: (audio) => { @@ -157,16 +244,32 @@ export async function startNodeRealtimeAudioBridge(params: { if (isFinal) { recordGoogleMeetRealtimeTranscript(transcript, role, text); params.logger.info(`[google-meet] node realtime ${role}: ${text}`); + if (role === "user" && strategy === "agent") { + enqueueAgentConsultForUserTranscript(text); + } } }, onEvent: (event) => { recordGoogleMeetRealtimeEvent(realtimeEvents, event); - if (event.type === "error" || event.type === "response.done") { + if ( + event.type === "error" || + event.type === "response.done" || + event.type === "input_audio_buffer.speech_started" || + event.type === "input_audio_buffer.speech_stopped" || + event.type === "conversation.item.input_audio_transcription.completed" || + event.type === "conversation.item.input_audio_transcription.failed" + ) { const detail = event.detail ? 
` ${event.detail}` : ""; params.logger.info(`[google-meet] node realtime ${event.direction}:${event.type}${detail}`); } }, onToolCall: (event, session) => { + if (strategy !== "bidi") { + session.submitToolResult(event.callId || event.itemId, { + error: `Tool "${event.name}" is only available in bidi realtime strategy`, + }); + return; + } if (event.name !== GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME) { session.submitToolResult(event.callId || event.itemId, { error: `Tool "${event.name}" not available`, diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index a398f215e33..2a54ae74bc7 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -99,10 +99,15 @@ export type GoogleMeetRealtimeEventEntry = RealtimeVoiceBridgeEvent & { at: string; }; +export const GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS = 900; + export function recordGoogleMeetRealtimeEvent( events: GoogleMeetRealtimeEventEntry[], event: RealtimeVoiceBridgeEvent, ) { + if (event.direction === "client" && event.type === "input_audio_buffer.append") { + return; + } events.push({ at: new Date().toISOString(), ...event }); if (events.length > 40) { events.splice(0, events.length - 40); @@ -173,6 +178,13 @@ export function resolveGoogleMeetRealtimeProvider(params: { }); } +export function buildGoogleMeetSpeakExactUserMessage(text: string): string { + return [ + "Speak this exact OpenClaw answer to the meeting, without adding, removing, or rephrasing words.", + `Answer: ${JSON.stringify(text)}`, + ].join("\n"); +} + export async function startCommandRealtimeAudioBridge(params: { config: GoogleMeetConfig; fullConfig: OpenClawConfig; @@ -212,6 +224,7 @@ export async function startCommandRealtimeAudioBridge(params: { let lastOutputAtMs = 0; let lastOutputPlayableUntilMs = 0; let bargeInInputProcess: BridgeProcess | undefined; + let agentConsultDebounceTimer: ReturnType | undefined; const suppressInputForOutput = (audio: Buffer) => { const 
bytesPerMs = params.config.chrome.audioFormat === "g711-ulaw-8khz" ? 8 : 48; @@ -254,6 +267,10 @@ export async function startCommandRealtimeAudioBridge(params: { return; } stopped = true; + if (agentConsultDebounceTimer) { + clearTimeout(agentConsultDebounceTimer); + agentConsultDebounceTimer = undefined; + } try { bridge?.close(); } catch (error) { @@ -279,6 +296,12 @@ export async function startCommandRealtimeAudioBridge(params: { } fail("audio output command")(error); }); + proc.stdin?.on?.("error", (error: Error) => { + if (proc !== outputProcess) { + return; + } + fail("audio output command")(error); + }); proc.on("exit", (code, signal) => { if (proc !== outputProcess) { return; @@ -310,6 +333,13 @@ export async function startCommandRealtimeAudioBridge(params: { ); terminateProcess(previousOutput, "SIGKILL"); }; + const writeOutputAudio = (audio: Buffer) => { + try { + outputProcess.stdin?.write(audio); + } catch (error) { + fail("audio output command")(error as Error); + } + }; const startHumanBargeInMonitor = () => { const commandArgv = params.config.chrome.bargeInInputCommand; if (!commandArgv) { @@ -384,17 +414,97 @@ export async function startCommandRealtimeAudioBridge(params: { fullConfig: params.fullConfig, providers: params.providers, }); + const strategy = params.config.realtime.strategy; const transcript: GoogleMeetRealtimeTranscriptEntry[] = []; const realtimeEvents: GoogleMeetRealtimeEventEntry[] = []; + let agentConsultActive = false; + let pendingAgentQuestion: string | undefined; + const enqueueAgentConsultForUserTranscript = (question: string): void => { + const trimmed = question.trim(); + if (!trimmed || stopped) { + return; + } + pendingAgentQuestion = pendingAgentQuestion ? 
`${pendingAgentQuestion}\n${trimmed}` : trimmed; + if (agentConsultDebounceTimer) { + clearTimeout(agentConsultDebounceTimer); + } + agentConsultDebounceTimer = setTimeout(() => { + agentConsultDebounceTimer = undefined; + const queuedQuestion = pendingAgentQuestion; + pendingAgentQuestion = undefined; + if (queuedQuestion && !stopped) { + void runAgentConsultForUserTranscript(queuedQuestion); + } + }, GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS); + agentConsultDebounceTimer.unref?.(); + }; + const runAgentConsultForUserTranscript = async (question: string): Promise => { + const trimmed = question.trim(); + if (!trimmed || stopped) { + return; + } + if (agentConsultActive) { + pendingAgentQuestion = trimmed; + return; + } + agentConsultActive = true; + let nextQuestion: string | undefined = trimmed; + try { + while (nextQuestion) { + if (stopped) { + return; + } + const currentQuestion = nextQuestion; + pendingAgentQuestion = undefined; + params.logger.info(`[google-meet] realtime agent consult: ${currentQuestion}`); + const result = await consultOpenClawAgentForGoogleMeet({ + config: params.config, + fullConfig: params.fullConfig, + runtime: params.runtime, + logger: params.logger, + meetingSessionId: params.meetingSessionId, + args: { + question: currentQuestion, + responseStyle: "Brief, natural spoken answer for a live meeting.", + }, + transcript, + }); + if (!stopped && result.text.trim()) { + bridge?.sendUserMessage(buildGoogleMeetSpeakExactUserMessage(result.text.trim())); + } + nextQuestion = pendingAgentQuestion; + } + } catch (error) { + params.logger.warn( + `[google-meet] realtime agent consult failed: ${formatErrorMessage(error)}`, + ); + if (!stopped) { + bridge?.sendUserMessage( + buildGoogleMeetSpeakExactUserMessage( + "I hit an error while checking that. 
Please try again.", + ), + ); + } + } finally { + agentConsultActive = false; + const queuedQuestion = pendingAgentQuestion; + pendingAgentQuestion = undefined; + if (queuedQuestion && !stopped) { + void runAgentConsultForUserTranscript(queuedQuestion); + } + } + }; bridge = createRealtimeVoiceBridgeSession({ provider: resolved.provider, providerConfig: resolved.providerConfig, audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config), instructions: params.config.realtime.instructions, initialGreetingInstructions: params.config.realtime.introMessage, + autoRespondToAudio: strategy === "bidi", triggerGreetingOnReady: false, markStrategy: "ack-immediately", - tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy), + tools: + strategy === "bidi" ? resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy) : [], audioSink: { isOpen: () => !stopped, sendAudio: (audio) => { @@ -402,7 +512,7 @@ export async function startCommandRealtimeAudioBridge(params: { lastOutputAt = new Date().toISOString(); lastOutputBytes += audio.byteLength; suppressInputForOutput(audio); - outputProcess.stdin?.write(audio); + writeOutputAudio(audio); }, clearAudio: clearOutputPlayback, }, @@ -410,16 +520,32 @@ export async function startCommandRealtimeAudioBridge(params: { if (isFinal) { recordGoogleMeetRealtimeTranscript(transcript, role, text); params.logger.info(`[google-meet] realtime ${role}: ${text}`); + if (role === "user" && strategy === "agent") { + enqueueAgentConsultForUserTranscript(text); + } } }, onEvent: (event) => { recordGoogleMeetRealtimeEvent(realtimeEvents, event); - if (event.type === "error" || event.type === "response.done") { + if ( + event.type === "error" || + event.type === "response.done" || + event.type === "input_audio_buffer.speech_started" || + event.type === "input_audio_buffer.speech_stopped" || + event.type === "conversation.item.input_audio_transcription.completed" || + event.type === 
"conversation.item.input_audio_transcription.failed" + ) { const detail = event.detail ? ` ${event.detail}` : ""; params.logger.info(`[google-meet] realtime ${event.direction}:${event.type}${detail}`); } }, onToolCall: (event, session) => { + if (strategy !== "bidi") { + session.submitToolResult(event.callId || event.itemId, { + error: `Tool "${event.name}" is only available in bidi realtime strategy`, + }); + return; + } if (event.name !== GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME) { session.submitToolResult(event.callId || event.itemId, { error: `Tool "${event.name}" not available`, diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index a953b4353ce..daa5c2716a6 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -392,6 +392,7 @@ export class GoogleMeetRuntime { : "signed-in Google Chrome profile", realtime: { enabled: mode === "realtime", + strategy: this.params.config.realtime.strategy, provider: this.params.config.realtime.provider, model: this.params.config.realtime.model, toolPolicy: this.params.config.realtime.toolPolicy, diff --git a/extensions/google-meet/src/transports/chrome.ts b/extensions/google-meet/src/transports/chrome.ts index 10879988ca6..1d001717f43 100644 --- a/extensions/google-meet/src/transports/chrome.ts +++ b/extensions/google-meet/src/transports/chrome.ts @@ -217,6 +217,9 @@ function parseMeetBrowserStatus(result: unknown): GoogleMeetChromeHealth | undef lastCaptionSpeaker?: string; lastCaptionText?: string; recentTranscript?: GoogleMeetChromeHealth["recentTranscript"]; + audioOutputRouted?: boolean; + audioOutputDeviceLabel?: string; + audioOutputRouteError?: string; manualActionRequired?: boolean; manualActionReason?: GoogleMeetChromeHealth["manualActionReason"]; manualActionMessage?: string; @@ -236,6 +239,9 @@ function parseMeetBrowserStatus(result: unknown): GoogleMeetChromeHealth | undef lastCaptionSpeaker: parsed.lastCaptionSpeaker, lastCaptionText: 
parsed.lastCaptionText, recentTranscript: parsed.recentTranscript, + audioOutputRouted: parsed.audioOutputRouted, + audioOutputDeviceLabel: parsed.audioOutputDeviceLabel, + audioOutputRouteError: parsed.audioOutputRouteError, manualActionRequired: parsed.manualActionRequired, manualActionReason: parsed.manualActionReason, manualActionMessage: parsed.manualActionMessage, @@ -329,7 +335,7 @@ function meetStatusScript(params: { guestName: string; readOnly?: boolean; }) { - return `() => { + return `async () => { const text = (node) => (node?.innerText || node?.textContent || "").trim(); const allowMicrophone = ${JSON.stringify(params.allowMicrophone)}; const captureCaptions = ${JSON.stringify(params.captureCaptions)}; @@ -345,6 +351,9 @@ function meetStatusScript(params: { .join(" "); const buttonLabels = buttons.map(buttonLabel).filter(Boolean); const notes = []; + let audioOutputRouted; + let audioOutputDeviceLabel; + let audioOutputRouteError; const findButton = (pattern) => buttons.find((button) => { const label = buttonLabel(button); @@ -398,6 +407,55 @@ function meetStatusScript(params: { notes.push("Skipped Meet microphone prompt for observe-only mode."); } const inCall = buttons.some((button) => /leave call/i.test(button.getAttribute('aria-label') || text(button))); + const routeMeetAudioOutput = async () => { + if ( + !allowMicrophone || + typeof navigator === 'undefined' || + !navigator.mediaDevices?.enumerateDevices + ) return; + const mediaElements = [...document.querySelectorAll('audio, video')] + .filter((el) => typeof el.setSinkId === 'function'); + if (mediaElements.length === 0) return; + try { + const devices = await navigator.mediaDevices.enumerateDevices(); + const output = devices.find((device) => + device.kind === 'audiooutput' && /\\bBlackHole\\s+2ch\\b/i.test(device.label || '') + ) || devices.find((device) => + device.kind === 'audiooutput' && /\\bBlackHole\\b/i.test(device.label || '') + ); + if (!output?.deviceId) { + if 
(devices.some((device) => device.kind === 'audiooutput')) { + notes.push("BlackHole 2ch speaker output was not visible to Meet."); + } + return; + } + let routed = 0; + for (const element of mediaElements) { + if (element.sinkId !== output.deviceId) { + if (readOnly) { + continue; + } + await element.setSinkId(output.deviceId); + routed += 1; + } + } + audioOutputRouted = mediaElements.some((element) => element.sinkId === output.deviceId); + audioOutputDeviceLabel = output.label || "BlackHole 2ch"; + if (!readOnly && audioOutputRouted) { + notes.push( + routed > 0 + ? \`Routed Meet media output to \${audioOutputDeviceLabel}.\` + : \`Meet media output already routed to \${audioOutputDeviceLabel}.\` + ); + } + } catch (error) { + audioOutputRouteError = error?.message || String(error); + notes.push(\`Could not route Meet speaker output to BlackHole 2ch: \${audioOutputRouteError}\`); + } + }; + if (inCall) { + await routeMeetAudioOutput(); + } let captioning = false; let captionsEnabledAttempted = false; let transcriptLines = 0; @@ -520,6 +578,9 @@ function meetStatusScript(params: { lastCaptionSpeaker, lastCaptionText, recentTranscript, + audioOutputRouted, + audioOutputDeviceLabel, + audioOutputRouteError, manualActionRequired: Boolean(manualActionReason), manualActionReason, manualActionMessage, diff --git a/extensions/google-meet/src/transports/types.ts b/extensions/google-meet/src/transports/types.ts index 8a391568eec..54c7d926192 100644 --- a/extensions/google-meet/src/transports/types.ts +++ b/extensions/google-meet/src/transports/types.ts @@ -71,6 +71,9 @@ export type GoogleMeetChromeHealth = { realtimeReady?: boolean; audioInputActive?: boolean; audioOutputActive?: boolean; + audioOutputRouted?: boolean; + audioOutputDeviceLabel?: string; + audioOutputRouteError?: string; lastInputAt?: string; lastOutputAt?: string; lastSuppressedInputAt?: string; @@ -100,6 +103,7 @@ export type GoogleMeetSession = { participantIdentity: string; realtime: { enabled: boolean; 
+ strategy?: string; provider?: string; model?: string; toolPolicy: string; diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts index 04a08cd2ccf..810c1058c20 100644 --- a/extensions/openai/realtime-voice-provider.test.ts +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -84,6 +84,9 @@ type SentRealtimeEvent = { session?: { input_audio_format?: string; output_audio_format?: string; + turn_detection?: { + create_response?: boolean; + }; }; }; @@ -415,6 +418,80 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(bridge.isConnected()).toBe(false); }); + it("can disable automatic audio turn responses for agent-routed voice loops", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + autoRespondToAudio: false, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; + + expect(parseSent(socket)[0]?.session).toMatchObject({ + turn_detection: expect.objectContaining({ + create_response: false, + }), + }); + }); + + it("keeps assistant playback active on server VAD when automatic audio responses are disabled", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const onAudio = vi.fn(); + const onClearAudio = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + autoRespondToAudio: false, + onAudio, + onClearAudio, + }); + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to 
create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); + await connecting; + + socket.emit( + "message", + Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })), + ); + socket.emit( + "message", + Buffer.from( + JSON.stringify({ + type: "response.audio.delta", + item_id: "item_1", + delta: Buffer.from("assistant audio").toString("base64"), + }), + ), + ); + socket.emit( + "message", + Buffer.from(JSON.stringify({ type: "input_audio_buffer.speech_started" })), + ); + + expect(onAudio).toHaveBeenCalledTimes(1); + expect(onClearAudio).not.toHaveBeenCalled(); + expect(parseSent(socket)).not.toContainEqual({ type: "response.cancel" }); + expect(parseSent(socket)).not.toContainEqual( + expect.objectContaining({ type: "conversation.item.truncate" }), + ); + }); + it("can request PCM16 24 kHz realtime audio for Chrome command-pair bridges", async () => { const provider = buildOpenAIRealtimeVoiceProvider(); const bridge = provider.createBridge({ @@ -566,7 +643,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { ); }); - it("creates an explicit user item and audio response for manual speech", async () => { + it("creates an explicit user item and response for manual speech", async () => { const provider = buildOpenAIRealtimeVoiceProvider(); const onEvent = vi.fn(); const bridge = provider.createBridge({ @@ -604,11 +681,9 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { }, { type: "response.create", - response: { - output_modalities: ["audio", "text"], - }, }, ]); + expect(JSON.stringify(parseSent(socket).at(-1))).not.toContain("output_modalities"); expect(onEvent).toHaveBeenCalledWith({ direction: "client", type: "conversation.item.create" }); expect(onEvent).toHaveBeenCalledWith({ direction: "client", type: "response.create" }); }); diff --git a/extensions/openai/realtime-voice-provider.ts 
b/extensions/openai/realtime-voice-provider.ts index 8ba56d613c2..c668242c308 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -266,12 +266,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { content: [{ type: "input_text", text }], }, }); - this.sendEvent({ - type: "response.create", - response: { - output_modalities: ["audio", "text"], - }, - }); + this.sendEvent({ type: "response.create" }); } triggerGreeting(instructions?: string): void { @@ -537,7 +532,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { threshold: cfg.vadThreshold ?? 0.5, prefix_padding_ms: cfg.prefixPaddingMs ?? 300, silence_duration_ms: cfg.silenceDurationMs ?? 500, - create_response: true, + create_response: cfg.autoRespondToAudio ?? true, }, temperature: cfg.temperature ?? 0.8, ...(cfg.tools && cfg.tools.length > 0 @@ -599,7 +594,9 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { } case "input_audio_buffer.speech_started": - this.handleBargeIn(); + if (this.config.autoRespondToAudio ?? 
true) { + this.handleBargeIn(); + } return; case "response.audio_transcript.delta": diff --git a/src/realtime-voice/agent-consult-runtime.ts b/src/realtime-voice/agent-consult-runtime.ts index df2a57bef75..a7ff9727463 100644 --- a/src/realtime-voice/agent-consult-runtime.ts +++ b/src/realtime-voice/agent-consult-runtime.ts @@ -39,6 +39,7 @@ export async function consultRealtimeVoiceAgent(params: { assistantLabel?: string; questionSourceLabel?: string; agentId?: string; + spawnedBy?: string | null; provider?: RunEmbeddedPiAgentParams["provider"]; model?: RunEmbeddedPiAgentParams["model"]; thinkLevel?: RunEmbeddedPiAgentParams["thinkLevel"]; @@ -73,6 +74,7 @@ export async function consultRealtimeVoiceAgent(params: { sessionKey: params.sessionKey, sandboxSessionKey: resolveRealtimeVoiceAgentSandboxSessionKey(agentId, params.sessionKey), agentId, + spawnedBy: params.spawnedBy, messageProvider: params.messageProvider, sessionFile, workspaceDir, diff --git a/src/realtime-voice/provider-types.ts b/src/realtime-voice/provider-types.ts index 5e2f399cf5a..d40d420e026 100644 --- a/src/realtime-voice/provider-types.ts +++ b/src/realtime-voice/provider-types.ts @@ -86,6 +86,7 @@ export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & { providerConfig: RealtimeVoiceProviderConfig; audioFormat?: RealtimeVoiceAudioFormat; instructions?: string; + autoRespondToAudio?: boolean; tools?: RealtimeVoiceTool[]; }; diff --git a/src/realtime-voice/session-runtime.test.ts b/src/realtime-voice/session-runtime.test.ts index 64fcb0ed8d0..db5752e75c5 100644 --- a/src/realtime-voice/session-runtime.test.ts +++ b/src/realtime-voice/session-runtime.test.ts @@ -79,6 +79,28 @@ describe("realtime voice bridge session runtime", () => { expect(request?.audioFormat).toEqual(REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ); }); + it("passes the audio auto-response preference to the provider bridge", () => { + let request: Parameters[0] | undefined; + const provider: 
RealtimeVoiceProviderPlugin = { + id: "test", + label: "Test", + isConfigured: () => true, + createBridge: (nextRequest) => { + request = nextRequest; + return makeBridge(); + }, + }; + + createRealtimeVoiceBridgeSession({ + provider, + providerConfig: {}, + autoRespondToAudio: false, + audioSink: { sendAudio: vi.fn() }, + }); + + expect(request?.autoRespondToAudio).toBe(false); + }); + it("can acknowledge provider marks without transport mark support", () => { let callbacks: Parameters[0] | undefined; const bridge = makeBridge(); diff --git a/src/realtime-voice/session-runtime.ts b/src/realtime-voice/session-runtime.ts index fb65a6cec20..fa43faf2bba 100644 --- a/src/realtime-voice/session-runtime.ts +++ b/src/realtime-voice/session-runtime.ts @@ -41,6 +41,7 @@ export type RealtimeVoiceBridgeSessionParams = { audioSink: RealtimeVoiceAudioSink; instructions?: string; initialGreetingInstructions?: string; + autoRespondToAudio?: boolean; markStrategy?: RealtimeVoiceMarkStrategy; triggerGreetingOnReady?: boolean; tools?: RealtimeVoiceTool[]; @@ -82,6 +83,7 @@ export function createRealtimeVoiceBridgeSession( providerConfig: params.providerConfig, audioFormat: params.audioFormat, instructions: params.instructions, + autoRespondToAudio: params.autoRespondToAudio, tools: params.tools, onAudio: (audio) => { if (canSendAudio()) {