diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dec5bab1c4..c764d3fcd95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai - Web search: route plugin-scoped web_search SecretRefs through the active runtime config snapshot so provider execution receives resolved credentials across app/runtime paths, including `plugins.entries.brave.config.webSearch.apiKey`. Fixes #68690. Thanks @VACInc. - Voice Call: allow SecretRef-backed Twilio auth tokens and call-specific OpenAI/ElevenLabs TTS API keys through the plugin config surface. Fixes #68690. Thanks @joshavant. - Google Meet: clean stale chrome-node realtime audio bridges by URL before rejoining, expose active node bridge inspection, and tolerate transient node input pull failures instead of dropping the Meet session. Fixes #72371. (#72372) Thanks @BsnizND. +- Google Meet: clear queued Gemini Live playback when realtime interruptions arrive, restart Chrome command-pair audio output after clears, and expose Google Live interruption/VAD config knobs for Meet and Voice Call realtime bridges. Fixes #72523. (#72524) Thanks @BsnizND. - Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras. - Cron: treat isolated run-level agent failures as job errors even when no reply payload is produced, synthesizing a safe error payload so model/provider failures increment error counters and trigger failure notifications instead of clearing as successful. Fixes #43604; carries forward #43631. Thanks @SPFAdvisors. - Cron: preserve exact `NO_REPLY` tool results from isolated jobs with empty final assistant turns as quiet successes instead of surfacing incomplete-turn errors. Fixes #68452; carries forward #68453. Thanks @anyech. diff --git a/docs/providers/google.md b/docs/providers/google.md index 62ee32ca353..8c01ee10876 100644 --- a/docs/providers/google.md +++ b/docs/providers/google.md @@ -308,6 +308,9 @@ Gemini Live API for backend audio bridges such as Voice Call and Google Meet. | VAD start sensitivity | `...google.startSensitivity` | (unset) | | VAD end sensitivity | `...google.endSensitivity` | (unset) | | Silence duration | `...google.silenceDurationMs` | (unset) | +| Activity handling | `...google.activityHandling` | Google default, `start-of-activity-interrupts` | +| Turn coverage | `...google.turnCoverage` | Google default, `only-activity` | +| Disable auto VAD | `...google.automaticActivityDetectionDisabled` | `false` | | API key | `...google.apiKey` | Falls back to `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY` | Example Voice Call realtime config: @@ -326,6 +329,8 @@ Example Voice Call realtime config: google: { model: "gemini-2.5-flash-native-audio-preview-12-2025", voice: "Kore", + activityHandling: "start-of-activity-interrupts", + turnCoverage: "only-activity", }, }, }, diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index e917116a0ff..b15a4cf394a 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -217,6 +217,7 @@ type TestBridgeProcess = { killed: boolean; kill: ReturnType; on: EventEmitter["on"]; + emit: EventEmitter["emit"]; }; describe("google-meet plugin", () => { @@ -1881,6 +1882,7 @@ describe("google-meet plugin", () => { let callbacks: | { onAudio: (audio: Buffer) => void; + onClearAudio: () => void; onMark?: (markName: string) => void; onToolCall?: (event: { itemId: string; @@ -1916,6 +1918,7 @@ describe("google-meet plugin", () => { }; const inputStdout = new PassThrough(); const outputStdinWrites: Buffer[] = []; + const replacementOutputStdinWrites: Buffer[] = []; const makeProcess = (stdio: { stdin?: { write(chunk: unknown): unknown } | null; stdout?: { on(event: "data", listener: (chunk: unknown) => void): unknown } | null; @@ -1937,9 +1940,20 @@ describe("google-meet plugin", () => { done(); }, }); + const replacementOutputStdin = new Writable({ + write(chunk, _encoding, done) { + replacementOutputStdinWrites.push(Buffer.from(chunk)); + done(); + }, + }); const inputProcess = makeProcess({ stdout: inputStdout, stdin: null }); const outputProcess = makeProcess({ stdin: outputStdin, stdout: null }); - const spawnMock = vi.fn().mockReturnValueOnce(outputProcess).mockReturnValueOnce(inputProcess); + const replacementOutputProcess = makeProcess({ stdin: replacementOutputStdin, stdout: null }); + const spawnMock = vi + .fn() + .mockReturnValueOnce(outputProcess) + .mockReturnValueOnce(inputProcess) + .mockReturnValueOnce(replacementOutputProcess); const sessionStore: Record = {}; const runtime = { agent: { @@ -1977,6 +1991,8 @@ describe("google-meet plugin", () => { inputStdout.write(Buffer.from([1, 2, 3])); callbacks?.onAudio(Buffer.from([4, 5])); callbacks?.onMark?.("mark-1"); + callbacks?.onClearAudio(); + callbacks?.onAudio(Buffer.from([6, 7])); callbacks?.onReady?.(); callbacks?.onToolCall?.({ itemId: "item-1", @@ -1993,6 +2009,10 @@ describe("google-meet plugin", () => { }); expect(sendAudio).toHaveBeenCalledWith(Buffer.from([1, 2, 3])); expect(outputStdinWrites).toEqual([Buffer.from([4, 5])]); + expect(outputProcess.kill).toHaveBeenCalledWith("SIGTERM"); + expect(replacementOutputStdinWrites).toEqual([Buffer.from([6, 7])]); + outputProcess.emit("error", new Error("stale output process failed after clear")); + expect(bridge.close).not.toHaveBeenCalled(); expect(bridge.acknowledgeMark).toHaveBeenCalled(); expect(bridge.triggerGreeting).not.toHaveBeenCalled(); handle.speak("Say exactly: hello from the meeting."); @@ -2003,7 +2023,8 @@ describe("google-meet plugin", () => { audioInputActive: true, audioOutputActive: true, lastInputBytes: 3, - lastOutputBytes: 2, + lastOutputBytes: 4, + clearCount: 1, }); expect(callbacks).toMatchObject({ tools: [ @@ -2035,6 +2056,7 @@ describe("google-meet plugin", () => { let callbacks: | { onAudio: (audio: Buffer) => void; + onClearAudio: () => void; onToolCall?: (event: { itemId: string; callId: string; @@ -2114,6 +2136,7 @@ describe("google-meet plugin", () => { }); callbacks?.onAudio(Buffer.from([1, 2, 3])); + callbacks?.onClearAudio(); callbacks?.onReady?.(); callbacks?.onToolCall?.({ itemId: "item-1", @@ -2138,6 +2161,19 @@ describe("google-meet plugin", () => { }), ); }); + await vi.waitFor(() => { + expect(runtime.nodes.invoke).toHaveBeenCalledWith( + expect.objectContaining({ + nodeId: "node-1", + command: "googlemeet.chrome", + params: { + action: "clearAudio", + bridgeId: "bridge-1", + }, + timeoutMs: 5_000, + }), + ); + }); await vi.waitFor(() => { expect(bridge.submitToolResult).toHaveBeenCalledWith("tool-call-1", { text: "Use the launch update.", @@ -2166,6 +2202,7 @@ describe("google-meet plugin", () => { audioOutputActive: true, lastInputBytes: 3, lastOutputBytes: 3, + clearCount: 1, }); await handle.stop(); diff --git a/extensions/google-meet/node-host.test.ts b/extensions/google-meet/node-host.test.ts index a9faed6edef..01cd731c6ca 100644 --- a/extensions/google-meet/node-host.test.ts +++ b/extensions/google-meet/node-host.test.ts @@ -40,6 +40,83 @@ vi.mock("node:child_process", async (importOriginal) => { }); describe("google-meet node host bridge sessions", () => { + it("clears output playback without closing the active bridge when the old output exits", async () => { + const { handleGoogleMeetNodeHostCommand } = await import("./src/node-host.js"); + const originalPlatform = process.platform; + children.length = 0; + + Object.defineProperty(process, "platform", { configurable: true, value: "darwin" }); + try { + const start = JSON.parse( + await handleGoogleMeetNodeHostCommand( + JSON.stringify({ + action: "start", + url: "https://meet.google.com/xyz-abcd-uvw", + mode: "realtime", + launch: false, + audioInputCommand: ["mock-rec"], + audioOutputCommand: ["mock-play"], + }), + ), + ); + + expect(children).toHaveLength(2); + const firstOutput = children[0]; + + const cleared = JSON.parse( + await handleGoogleMeetNodeHostCommand( + JSON.stringify({ + action: "clearAudio", + bridgeId: start.bridgeId, + }), + ), + ); + + expect(cleared).toEqual({ bridgeId: start.bridgeId, ok: true, clearCount: 1 }); + expect(children).toHaveLength(3); + expect(firstOutput?.kill).toHaveBeenCalledWith("SIGTERM"); + + firstOutput?.emit("error", new Error("stale output failed after clear")); + firstOutput?.emit("exit", 0, "SIGTERM"); + + const status = JSON.parse( + await handleGoogleMeetNodeHostCommand( + JSON.stringify({ + action: "status", + bridgeId: start.bridgeId, + }), + ), + ); + + expect(status.bridge).toMatchObject({ + bridgeId: start.bridgeId, + closed: false, + clearCount: 1, + }); + + const audio = Buffer.from([1, 2, 3]); + await handleGoogleMeetNodeHostCommand( + JSON.stringify({ + action: "pushAudio", + bridgeId: start.bridgeId, + base64: audio.toString("base64"), + }), + ); + + expect(children[2]?.stdin?.write).toHaveBeenCalledWith(audio); + expect(firstOutput?.stdin?.write).not.toHaveBeenCalled(); + + await handleGoogleMeetNodeHostCommand( + JSON.stringify({ + action: "stop", + bridgeId: start.bridgeId, + }), + ); + } finally { + Object.defineProperty(process, "platform", { configurable: true, value: originalPlatform }); + } + }); + it("lists active bridge sessions and hides closed sessions", async () => { const { handleGoogleMeetNodeHostCommand } = await import("./src/node-host.js"); const originalPlatform = process.platform; diff --git a/extensions/google-meet/src/node-host.ts b/extensions/google-meet/src/node-host.ts index c1a7260f166..c10db654ecf 100644 --- a/extensions/google-meet/src/node-host.ts +++ b/extensions/google-meet/src/node-host.ts @@ -15,6 +15,7 @@ type NodeBridgeSession = { id: string; url?: string; mode?: string; + outputCommand: { command: string; args: string[] }; input?: ChildProcess; output?: ChildProcess; chunks: Buffer[]; @@ -23,9 +24,11 @@ type NodeBridgeSession = { createdAt: string; lastInputAt?: string; lastOutputAt?: string; + lastClearAt?: string; lastInputBytes: number; lastOutputBytes: number; closedAt?: string; + clearCount: number; }; const sessions = new Map(); @@ -110,6 +113,25 @@ function stopSession(session: NodeBridgeSession) { wake(session); } +function attachOutputProcessHandlers(session: NodeBridgeSession, outputProcess: ChildProcess) { + outputProcess.on("exit", () => { + if (session.output === outputProcess) { + stopSession(session); + } + }); + outputProcess.on("error", () => { + if (session.output === outputProcess) { + stopSession(session); + } + }); +} + +function startOutputProcess(command: { command: string; args: string[] }) { + return spawn(command.command, command.args, { + stdio: ["pipe", "ignore", "pipe"], + }); +} + function startCommandPair(params: { inputCommand: string[]; outputCommand: string[]; @@ -122,16 +144,16 @@ function startCommandPair(params: { id: `meet_node_${randomUUID()}`, url: params.url, mode: params.mode, + outputCommand: output, chunks: [], waiters: [], closed: false, createdAt: new Date().toISOString(), lastInputBytes: 0, lastOutputBytes: 0, + clearCount: 0, }; - const outputProcess = spawn(output.command, output.args, { - stdio: ["pipe", "ignore", "pipe"], - }); + const outputProcess = startOutputProcess(output); const inputProcess = spawn(input.command, input.args, { stdio: ["ignore", "pipe", "pipe"], }); @@ -148,9 +170,8 @@ function startCommandPair(params: { wake(session); }); inputProcess.on("exit", () => stopSession(session)); - outputProcess.on("exit", () => stopSession(session)); + attachOutputProcessHandlers(session, outputProcess); inputProcess.on("error", () => stopSession(session)); - outputProcess.on("error", () => stopSession(session)); sessions.set(session.id, session); return session; } @@ -224,6 +245,25 @@ function pushAudio(params: Record) { return { bridgeId, ok: true }; } +function clearAudio(params: Record) { + const bridgeId = readString(params.bridgeId); + if (!bridgeId) { + throw new Error("bridgeId required"); + } + const session = sessions.get(bridgeId); + if (!session || session.closed) { + throw new Error(`bridge is not open: ${bridgeId}`); + } + const previousOutput = session.output; + const outputProcess = startOutputProcess(session.outputCommand); + session.output = outputProcess; + attachOutputProcessHandlers(session, outputProcess); + session.clearCount += 1; + session.lastClearAt = new Date().toISOString(); + terminateChild(previousOutput); + return { bridgeId, ok: true, clearCount: session.clearCount }; +} + function startChrome(params: Record) { const url = readString(params.url); if (!url) { @@ -317,8 +357,11 @@ function bridgeStatus(params: Record) { createdAt: session.createdAt, lastInputAt: session.lastInputAt, lastOutputAt: session.lastOutputAt, + lastClearAt: session.lastClearAt, lastInputBytes: session.lastInputBytes, lastOutputBytes: session.lastOutputBytes, + clearCount: session.clearCount, + queuedInputChunks: session.chunks.length, } : bridgeId ? { bridgeId, closed: true } @@ -438,6 +481,9 @@ export async function handleGoogleMeetNodeHostCommand(paramsJSON?: string | null case "pushAudio": result = pushAudio(params); break; + case "clearAudio": + result = clearAudio(params); + break; case "stop": result = stopChrome(params); break; diff --git a/extensions/google-meet/src/realtime-node.ts b/extensions/google-meet/src/realtime-node.ts index 7c21f8d0a37..1d77df3723d 100644 --- a/extensions/google-meet/src/realtime-node.ts +++ b/extensions/google-meet/src/realtime-node.ts @@ -50,10 +50,12 @@ export async function startNodeRealtimeAudioBridge(params: { let realtimeReady = false; let lastInputAt: string | undefined; let lastOutputAt: string | undefined; + let lastClearAt: string | undefined; let lastInputBytes = 0; let lastOutputBytes = 0; let consecutiveInputErrors = 0; let lastInputError: string | undefined; + let clearCount = 0; const resolved = resolveGoogleMeetRealtimeProvider({ config: params.config, fullConfig: params.fullConfig, @@ -118,6 +120,26 @@ export async function startNodeRealtimeAudioBridge(params: { void stop(); }); }, + clearAudio: () => { + lastClearAt = new Date().toISOString(); + clearCount += 1; + void params.runtime.nodes + .invoke({ + nodeId: params.nodeId, + command: "googlemeet.chrome", + params: { + action: "clearAudio", + bridgeId: params.bridgeId, + }, + timeoutMs: 5_000, + }) + .catch((error) => { + params.logger.warn( + `[google-meet] node audio clear failed: ${formatErrorMessage(error)}`, + ); + void stop(); + }); + }, }, onTranscript: (role, text, isFinal) => { if (isFinal) { @@ -230,10 +252,12 @@ export async function startNodeRealtimeAudioBridge(params: { audioOutputActive: lastOutputBytes > 0, lastInputAt, lastOutputAt, + lastClearAt, lastInputBytes, lastOutputBytes, consecutiveInputErrors, lastInputError, + clearCount, bridgeClosed: stopped, }), stop, diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index 42deefc2323..b40b820cc9c 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -91,9 +91,11 @@ export async function startCommandRealtimeAudioBridge(params: { const spawnFn: SpawnFn = params.spawn ?? ((command, args, options) => spawn(command, args, options) as unknown as BridgeProcess); - const outputProcess = spawnFn(output.command, output.args, { - stdio: ["pipe", "ignore", "pipe"], - }); + const spawnOutputProcess = () => + spawnFn(output.command, output.args, { + stdio: ["pipe", "ignore", "pipe"], + }); + let outputProcess = spawnOutputProcess(); const inputProcess = spawnFn(input.command, input.args, { stdio: ["ignore", "pipe", "pipe"], }); @@ -104,6 +106,8 @@ export async function startCommandRealtimeAudioBridge(params: { let lastOutputAt: string | undefined; let lastInputBytes = 0; let lastOutputBytes = 0; + let lastClearAt: string | undefined; + let clearCount = 0; const stop = async () => { if (stopped) { @@ -125,26 +129,53 @@ export async function startCommandRealtimeAudioBridge(params: { params.logger.warn(`[google-meet] ${label} failed: ${formatErrorMessage(error)}`); void stop(); }; + const attachOutputProcessHandlers = (proc: BridgeProcess) => { + proc.on("error", (error) => { + if (proc !== outputProcess) { + return; + } + fail("audio output command")(error); + }); + proc.on("exit", (code, signal) => { + if (proc !== outputProcess) { + return; + } + if (!stopped) { + params.logger.warn( + `[google-meet] audio output command exited (${code ?? signal ?? "done"})`, + ); + void stop(); + } + }); + proc.stderr?.on("data", (chunk) => { + params.logger.debug?.(`[google-meet] audio output: ${String(chunk).trim()}`); + }); + }; + const clearOutputPlayback = () => { + if (stopped) { + return; + } + const previousOutput = outputProcess; + outputProcess = spawnOutputProcess(); + attachOutputProcessHandlers(outputProcess); + clearCount += 1; + lastClearAt = new Date().toISOString(); + params.logger.debug?.( + `[google-meet] cleared realtime audio output buffer by restarting playback command`, + ); + previousOutput.kill("SIGTERM"); + }; inputProcess.on("error", fail("audio input command")); - outputProcess.on("error", fail("audio output command")); inputProcess.on("exit", (code, signal) => { if (!stopped) { params.logger.warn(`[google-meet] audio input command exited (${code ?? signal ?? "done"})`); void stop(); } }); - outputProcess.on("exit", (code, signal) => { - if (!stopped) { - params.logger.warn(`[google-meet] audio output command exited (${code ?? signal ?? "done"})`); - void stop(); - } - }); + attachOutputProcessHandlers(outputProcess); inputProcess.stderr?.on("data", (chunk) => { params.logger.debug?.(`[google-meet] audio input: ${String(chunk).trim()}`); }); - outputProcess.stderr?.on("data", (chunk) => { - params.logger.debug?.(`[google-meet] audio output: ${String(chunk).trim()}`); - }); const resolved = resolveGoogleMeetRealtimeProvider({ config: params.config, @@ -167,6 +198,7 @@ export async function startCommandRealtimeAudioBridge(params: { lastOutputBytes += muLaw.byteLength; outputProcess.stdin?.write(muLaw); }, + clearAudio: clearOutputPlayback, }, onTranscript: (role, text, isFinal) => { if (isFinal) { @@ -240,6 +272,8 @@ export async function startCommandRealtimeAudioBridge(params: { lastOutputAt, lastInputBytes, lastOutputBytes, + lastClearAt, + clearCount, bridgeClosed: stopped, }), stop, diff --git a/extensions/google-meet/src/transports/types.ts b/extensions/google-meet/src/transports/types.ts index 037b288beeb..d61ab85494f 100644 --- a/extensions/google-meet/src/transports/types.ts +++ b/extensions/google-meet/src/transports/types.ts @@ -31,10 +31,13 @@ export type GoogleMeetChromeHealth = { audioOutputActive?: boolean; lastInputAt?: string; lastOutputAt?: string; + lastClearAt?: string; lastInputBytes?: number; lastOutputBytes?: number; consecutiveInputErrors?: number; lastInputError?: string; + clearCount?: number; + queuedInputChunks?: number; browserUrl?: string; browserTitle?: string; bridgeClosed?: boolean; diff --git a/extensions/google/realtime-voice-provider.test.ts b/extensions/google/realtime-voice-provider.test.ts index 626a75eb03c..4656e956214 100644 --- a/extensions/google/realtime-voice-provider.test.ts +++ b/extensions/google/realtime-voice-provider.test.ts @@ -77,6 +77,9 @@ describe("buildGoogleRealtimeVoiceProvider", () => { temperature: 0.4, silenceDurationMs: 700, startSensitivity: "high", + activityHandling: "no_interruption", + turnCoverage: "turn_includes_only_activity", + automaticActivityDetectionDisabled: false, }, }, }, @@ -92,6 +95,9 @@ describe("buildGoogleRealtimeVoiceProvider", () => { silenceDurationMs: 700, startSensitivity: "high", endSensitivity: undefined, + activityHandling: "no-interruption", + turnCoverage: "only-activity", + automaticActivityDetectionDisabled: false, enableAffectiveDialog: undefined, thinkingLevel: undefined, thinkingBudget: undefined, @@ -107,6 +113,9 @@ describe("buildGoogleRealtimeVoiceProvider", () => { voice: "Kore", temperature: 0.3, startSensitivity: "low", + endSensitivity: "low", + activityHandling: "no-interruption", + turnCoverage: "only-activity", }, instructions: "Speak briefly.", tools: [ @@ -144,6 +153,14 @@ describe("buildGoogleRealtimeVoiceProvider", () => { }, }, outputAudioTranscription: {}, + realtimeInputConfig: { + activityHandling: "NO_INTERRUPTION", + automaticActivityDetection: { + startOfSpeechSensitivity: "START_SENSITIVITY_LOW", + endOfSpeechSensitivity: "END_SENSITIVITY_LOW", + }, + turnCoverage: "TURN_INCLUDES_ONLY_ACTIVITY", + }, tools: [ { functionDeclarations: [ @@ -240,6 +257,28 @@ describe("buildGoogleRealtimeVoiceProvider", () => { expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true }); }); + it("can disable automatic VAD for manual activity signaling experiments", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { + apiKey: "gemini-key", + automaticActivityDetectionDisabled: true, + }, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + await bridge.connect(); + + expect(lastConnectParams().config).toMatchObject({ + realtimeInputConfig: { + automaticActivityDetection: { + disabled: true, + }, + }, + }); + }); + it("sends text prompts as ordered client turns", async () => { const provider = buildGoogleRealtimeVoiceProvider(); const bridge = provider.createBridge({ diff --git a/extensions/google/realtime-voice-provider.ts b/extensions/google/realtime-voice-provider.ts index 269792a42c8..83333ce6b52 100644 --- a/extensions/google/realtime-voice-provider.ts +++ b/extensions/google/realtime-voice-provider.ts @@ -1,8 +1,10 @@ import { randomUUID } from "node:crypto"; import { + ActivityHandling, EndSensitivity, Modality, StartSensitivity, + TurnCoverage, type FunctionDeclaration, type FunctionResponse, type LiveServerContent, @@ -34,6 +36,8 @@ const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700; type GoogleRealtimeSensitivity = "low" | "high"; type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high"; +type GoogleRealtimeActivityHandling = "start-of-activity-interrupts" | "no-interruption"; +type GoogleRealtimeTurnCoverage = "only-activity" | "all-input" | "audio-activity-and-all-video"; type GoogleRealtimeVoiceProviderConfig = { apiKey?: string; @@ -45,6 +49,9 @@ type GoogleRealtimeVoiceProviderConfig = { silenceDurationMs?: number; startSensitivity?: GoogleRealtimeSensitivity; endSensitivity?: GoogleRealtimeSensitivity; + activityHandling?: GoogleRealtimeActivityHandling; + turnCoverage?: GoogleRealtimeTurnCoverage; + automaticActivityDetectionDisabled?: boolean; enableAffectiveDialog?: boolean; thinkingLevel?: GoogleRealtimeThinkingLevel; thinkingBudget?: number; @@ -60,6 +67,9 @@ type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & { silenceDurationMs?: number; startSensitivity?: GoogleRealtimeSensitivity; endSensitivity?: GoogleRealtimeSensitivity; + activityHandling?: GoogleRealtimeActivityHandling; + turnCoverage?: GoogleRealtimeTurnCoverage; + automaticActivityDetectionDisabled?: boolean; enableAffectiveDialog?: boolean; thinkingLevel?: GoogleRealtimeThinkingLevel; thinkingBudget?: number; @@ -105,6 +115,40 @@ function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefine : undefined; } +function asActivityHandling(value: unknown): GoogleRealtimeActivityHandling | undefined { + const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-"); + switch (normalized) { + case "start-of-activity-interrupts": + case "start-of-activity-interrupt": + case "interrupt": + case "interrupts": + return "start-of-activity-interrupts"; + case "no-interruption": + case "no-interruptions": + case "none": + return "no-interruption"; + default: + return undefined; + } +} + +function asTurnCoverage(value: unknown): GoogleRealtimeTurnCoverage | undefined { + const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-"); + switch (normalized) { + case "only-activity": + case "turn-includes-only-activity": + return "only-activity"; + case "all-input": + case "turn-includes-all-input": + return "all-input"; + case "audio-activity-and-all-video": + case "turn-includes-audio-activity-and-all-video": + return "audio-activity-and-all-video"; + default: + return undefined; + } +} + function resolveGoogleRealtimeProviderConfigRecord( config: Record, ): Record | undefined { @@ -140,6 +184,9 @@ function normalizeProviderConfig( silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs), startSensitivity: asSensitivity(raw?.startSensitivity), endSensitivity: asSensitivity(raw?.endSensitivity), + activityHandling: asActivityHandling(raw?.activityHandling), + turnCoverage: asTurnCoverage(raw?.turnCoverage), + automaticActivityDetectionDisabled: asBoolean(raw?.automaticActivityDetectionDisabled), enableAffectiveDialog: asBoolean(raw?.enableAffectiveDialog), thinkingLevel: asThinkingLevel(raw?.thinkingLevel), thinkingBudget: asFiniteNumber(raw?.thinkingBudget), @@ -176,6 +223,32 @@ function mapEndSensitivity( } } +function mapActivityHandling( + value: GoogleRealtimeActivityHandling | undefined, +): ActivityHandling | undefined { + switch (value) { + case "no-interruption": + return ActivityHandling.NO_INTERRUPTION; + case "start-of-activity-interrupts": + return ActivityHandling.START_OF_ACTIVITY_INTERRUPTS; + default: + return undefined; + } +} + +function mapTurnCoverage(value: GoogleRealtimeTurnCoverage | undefined): TurnCoverage | undefined { + switch (value) { + case "only-activity": + return TurnCoverage.TURN_INCLUDES_ONLY_ACTIVITY; + case "all-input": + return TurnCoverage.TURN_INCLUDES_ALL_INPUT; + case "audio-activity-and-all-video": + return TurnCoverage.TURN_INCLUDES_AUDIO_ACTIVITY_AND_ALL_VIDEO; + default: + return undefined; + } +} + function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined { if (config.thinkingLevel) { return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] }; @@ -191,7 +264,12 @@ function buildRealtimeInputConfig( ): RealtimeInputConfig | undefined { const startSensitivity = mapStartSensitivity(config.startSensitivity); const endSensitivity = mapEndSensitivity(config.endSensitivity); + const activityHandling = mapActivityHandling(config.activityHandling); + const turnCoverage = mapTurnCoverage(config.turnCoverage); const automaticActivityDetection = { + ...(typeof config.automaticActivityDetectionDisabled === "boolean" + ? { disabled: config.automaticActivityDetectionDisabled } + : {}), ...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}), ...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}), ...(typeof config.prefixPaddingMs === "number" @@ -201,9 +279,12 @@ function buildRealtimeInputConfig( ? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) } : {}), }; - return Object.keys(automaticActivityDetection).length > 0 - ? { automaticActivityDetection } - : undefined; + const realtimeInputConfig = { + ...(Object.keys(automaticActivityDetection).length > 0 ? { automaticActivityDetection } : {}), + ...(activityHandling ? { activityHandling } : {}), + ...(turnCoverage ? { turnCoverage } : {}), + }; + return Object.keys(realtimeInputConfig).length > 0 ? realtimeInputConfig : undefined; } function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] { @@ -519,6 +600,9 @@ export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin silenceDurationMs: config.silenceDurationMs, startSensitivity: config.startSensitivity, endSensitivity: config.endSensitivity, + activityHandling: config.activityHandling, + turnCoverage: config.turnCoverage, + automaticActivityDetectionDisabled: config.automaticActivityDetectionDisabled, enableAffectiveDialog: config.enableAffectiveDialog, thinkingLevel: config.thinkingLevel, thinkingBudget: config.thinkingBudget,