From d73e2ee77456ed00c94f052549caa10c0c555cda Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 27 Apr 2026 12:54:54 +0100 Subject: [PATCH] fix(google-meet): use PCM audio for Chrome realtime --- CHANGELOG.md | 1 + docs/.generated/config-baseline.sha256 | 4 +- docs/plugins/google-meet.md | 18 ++-- extensions/google-meet/index.test.ts | 40 +++++++-- extensions/google-meet/index.ts | 9 +- extensions/google-meet/openclaw.plugin.json | 28 ++++-- extensions/google-meet/src/cli.test.ts | 4 +- extensions/google-meet/src/config.ts | 88 +++++++++++++++++-- extensions/google-meet/src/realtime-node.ts | 12 ++- extensions/google-meet/src/realtime.ts | 15 +++- extensions/google-meet/src/setup.ts | 2 +- .../google/realtime-voice-provider.test.ts | 58 ++++++++++++ extensions/google/realtime-voice-provider.ts | 53 +++++++++-- .../openai/realtime-voice-provider.test.ts | 39 +++++++- extensions/openai/realtime-voice-provider.ts | 15 +++- src/plugin-sdk/realtime-voice.ts | 5 ++ src/realtime-voice/provider-types.ts | 27 +++++- src/realtime-voice/session-runtime.test.ts | 27 +++++- src/realtime-voice/session-runtime.ts | 9 +- 19 files changed, 395 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62e67d9fec6..638ef5f17d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -126,6 +126,7 @@ Docs: https://docs.openclaw.ai - Web search: route plugin-scoped web_search SecretRefs through the active runtime config snapshot so provider execution receives resolved credentials across app/runtime paths, including `plugins.entries.brave.config.webSearch.apiKey`. Fixes #68690. Thanks @VACInc. - Voice Call: allow SecretRef-backed Twilio auth tokens and call-specific OpenAI/ElevenLabs TTS API keys through the plugin config surface. Fixes #68690. Thanks @joshavant. - Google Meet: clean stale chrome-node realtime audio bridges by URL before rejoining, expose active node bridge inspection, and tolerate transient node input pull failures instead of dropping the Meet session. Fixes #72371. (#72372) Thanks @BsnizND. +- Google Meet: use 24 kHz PCM16 for Chrome command-pair realtime audio by default, preserve legacy 8 kHz G.711 mu-law custom command pairs, and let realtime providers negotiate the selected bridge audio format. Fixes #72525. Thanks @BsnizND. - Google Meet: clear queued Gemini Live playback when realtime interruptions arrive, restart Chrome command-pair audio output after clears, and expose Google Live interruption/VAD config knobs for Meet and Voice Call realtime bridges. Fixes #72523. (#72524) Thanks @BsnizND. - Google Meet: add `realtime.agentId` so live meeting consults can target a named OpenClaw agent instead of always using `main`. (#72381) Thanks @BsnizND. - Google Meet: route stateful `google_meet` tool actions through the gateway-owned runtime so created or joined realtime sessions remain visible to status, speak, and leave after the agent turn ends. Fixes #72440. (#72441) Thanks @BsnizND. diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index 07ff0a2de67..1effeeb67fa 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -5027142b42acd038bb3cd15e53a0d45293103448a3aee1072500352095e14242 config-baseline.json +33425d446eda183d3574ee754bb44e7e546ea33afa855fc979f94b1e102bf047 config-baseline.json ecb702eee54bcb697916944440e13208ac7a640a8e07f44072bb79e9284ca994 config-baseline.core.json 07963db49502132f26db396c56b36e018b110e6c55a68b3cb012d3ec96f43901 config-baseline.channel.json -ed65cefbef96f034ce2b73069d9d5bacc341a43489ff9b20a34d40956b877f79 config-baseline.plugin.json +13d038300d90d4dd064aa2ac79def867799d1be403cf9d3e81dfad35ef459a21 config-baseline.plugin.json diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index 68d53f496ec..eee000ee0bc 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -336,7 +336,7 @@ Common failure checks: The Chrome realtime default uses two external tools: - `sox`: command-line audio utility. The plugin uses its `rec` and `play` - commands for the default 8 kHz G.711 mu-law audio bridge. + commands for the default 24 kHz PCM16 audio bridge. - `blackhole-2ch`: macOS virtual audio driver. It creates the `BlackHole 2ch` audio device that Chrome/Meet can route through. @@ -887,10 +887,13 @@ Defaults: opening duplicates - `chrome.waitForInCallMs: 20000`: wait for the Meet tab to report in-call before the realtime intro is triggered -- `chrome.audioInputCommand`: SoX `rec` command writing 8 kHz G.711 mu-law - audio to stdout -- `chrome.audioOutputCommand`: SoX `play` command reading 8 kHz G.711 mu-law - audio from stdin +- `chrome.audioFormat: "pcm16-24khz"`: command-pair audio format. Use + `"g711-ulaw-8khz"` only for legacy/custom command pairs that still emit + telephony audio. +- `chrome.audioInputCommand`: SoX `rec` command writing audio in + `chrome.audioFormat` +- `chrome.audioOutputCommand`: SoX `play` command reading audio in + `chrome.audioFormat` - `realtime.provider: "openai"` - `realtime.toolPolicy: "safe-read-only"` - `realtime.instructions`: brief spoken replies, with @@ -1313,8 +1316,9 @@ phone dial-in participation. Chrome realtime mode needs either: - `chrome.audioInputCommand` plus `chrome.audioOutputCommand`: OpenClaw owns the - realtime model bridge and pipes 8 kHz G.711 mu-law audio between those - commands and the selected realtime voice provider. + realtime model bridge and pipes audio in `chrome.audioFormat` between those + commands and the selected realtime voice provider. The default Chrome path is + 24 kHz PCM16; 8 kHz G.711 mu-law remains available for legacy command pairs. - `chrome.audioBridgeCommand`: an external bridge command owns the whole local audio path and must exit after starting or validating its daemon. diff --git a/extensions/google-meet/index.test.ts b/extensions/google-meet/index.test.ts index ac6e6d77e16..cccdbeac4c1 100644 --- a/extensions/google-meet/index.test.ts +++ b/extensions/google-meet/index.test.ts @@ -257,19 +257,21 @@ describe("google-meet plugin", () => { reuseExistingTab: true, autoJoin: true, waitForInCallMs: 20000, + audioFormat: "pcm16-24khz", audioInputCommand: [ "rec", "-q", "-t", "raw", "-r", - "8000", + "24000", "-c", "1", "-e", - "mu-law", + "signed-integer", "-b", - "8", + "16", + "-L", "-", ], audioOutputCommand: [ @@ -278,13 +280,14 @@ describe("google-meet plugin", () => { "-t", "raw", "-r", - "8000", + "24000", "-c", "1", "-e", - "mu-law", + "signed-integer", "-b", - "8", + "16", + "-L", "-", ], }, @@ -310,6 +313,21 @@ describe("google-meet plugin", () => { ).toBe("jay"); }); + it("keeps legacy command-pair audio format when custom commands omit a format", () => { + expect( + resolveGoogleMeetConfig({ + chrome: { + audioInputCommand: ["capture-legacy"], + audioOutputCommand: ["play-legacy"], + }, + }).chrome, + ).toMatchObject({ + audioFormat: "g711-ulaw-8khz", + audioInputCommand: ["capture-legacy"], + audioOutputCommand: ["play-legacy"], + }); + }); + it("uses env fallbacks for OAuth, preview, and default meeting values", () => { expect( resolveGoogleMeetConfigWithEnv( @@ -2085,6 +2103,11 @@ describe("google-meet plugin", () => { clearCount: 1, }); expect(callbacks).toMatchObject({ + audioFormat: { + encoding: "pcm16", + sampleRateHz: 24000, + channels: 1, + }, tools: [ expect.objectContaining({ name: "openclaw_agent_consult", @@ -2263,6 +2286,11 @@ describe("google-meet plugin", () => { handle.speak("Say exactly: hello from the node."); expect(bridge.triggerGreeting).toHaveBeenLastCalledWith("Say exactly: hello from the node."); expect(callbacks).toMatchObject({ + audioFormat: { + encoding: "pcm16", + sampleRateHz: 24000, + channels: 1, + }, tools: [ expect.objectContaining({ name: "openclaw_agent_consult", diff --git a/extensions/google-meet/index.ts b/extensions/google-meet/index.ts index 281d44ba112..4198df386e1 100644 --- a/extensions/google-meet/index.ts +++ b/extensions/google-meet/index.ts @@ -76,14 +76,19 @@ const googleMeetConfigSchema = { help: "Waits for Chrome to report that the Meet tab is in-call before the realtime intro speaks.", advanced: true, }, + "chrome.audioFormat": { + label: "Audio Format", + help: "Command-pair audio format. PCM16 24 kHz is the default Chrome/Meet path; G.711 mu-law 8 kHz remains available for legacy command pairs.", + advanced: true, + }, "chrome.audioInputCommand": { label: "Audio Input Command", - help: "Command that writes 8 kHz G.711 mu-law meeting audio to stdout.", + help: "Command that writes meeting audio to stdout in chrome.audioFormat.", advanced: true, }, "chrome.audioOutputCommand": { label: "Audio Output Command", - help: "Command that reads 8 kHz G.711 mu-law assistant audio from stdin.", + help: "Command that reads assistant audio from stdin in chrome.audioFormat.", advanced: true, }, "chrome.audioBridgeCommand": { label: "Audio Bridge Command", advanced: true }, diff --git a/extensions/google-meet/openclaw.plugin.json b/extensions/google-meet/openclaw.plugin.json index d5a308f9165..18a40ee20e7 100644 --- a/extensions/google-meet/openclaw.plugin.json +++ b/extensions/google-meet/openclaw.plugin.json @@ -56,12 +56,17 @@ }, "chrome.audioInputCommand": { "label": "Audio Input Command", - "help": "Command that writes 8 kHz G.711 mu-law meeting audio to stdout.", + "help": "Command that writes meeting audio to stdout in chrome.audioFormat.", "advanced": true }, "chrome.audioOutputCommand": { "label": "Audio Output Command", - "help": "Command that reads 8 kHz G.711 mu-law assistant audio from stdin.", + "help": "Command that reads assistant audio from stdin in chrome.audioFormat.", + "advanced": true + }, + "chrome.audioFormat": { + "label": "Audio Format", + "help": "Command-pair audio format. PCM16 24 kHz is the default Chrome/Meet path; G.711 mu-law 8 kHz remains available for legacy command pairs.", "advanced": true }, "chrome.audioBridgeCommand": { @@ -232,6 +237,11 @@ "type": "number", "default": 20000 }, + "audioFormat": { + "type": "string", + "enum": ["pcm16-24khz", "g711-ulaw-8khz"], + "default": "pcm16-24khz" + }, "audioInputCommand": { "type": "array", "default": [ @@ -240,13 +250,14 @@ "-t", "raw", "-r", - "8000", + "24000", "-c", "1", "-e", - "mu-law", + "signed-integer", "-b", - "8", + "16", + "-L", "-" ], "items": { @@ -261,13 +272,14 @@ "-t", "raw", "-r", - "8000", + "24000", "-c", "1", "-e", - "mu-law", + "signed-integer", "-b", - "8", + "16", + "-L", "-" ], "items": { diff --git a/extensions/google-meet/src/cli.test.ts b/extensions/google-meet/src/cli.test.ts index 446a60ffe66..cb6eb97eb0e 100644 --- a/extensions/google-meet/src/cli.test.ts +++ b/extensions/google-meet/src/cli.test.ts @@ -218,7 +218,7 @@ describe("google-meet CLI", () => { { id: "audio-bridge", ok: true, - message: "Chrome command-pair realtime audio bridge configured", + message: "Chrome command-pair realtime audio bridge configured (pcm16-24khz)", }, ], }), @@ -226,7 +226,7 @@ describe("google-meet CLI", () => { }).parseAsync(["googlemeet", "setup"], { from: "user" }); expect(stdout.output()).toContain("Google Meet setup: OK"); expect(stdout.output()).toContain( - "[ok] audio-bridge: Chrome command-pair realtime audio bridge configured", + "[ok] audio-bridge: Chrome command-pair realtime audio bridge configured (pcm16-24khz)", ); expect(stdout.output()).not.toContain('"checks"'); } finally { diff --git a/extensions/google-meet/src/config.ts b/extensions/google-meet/src/config.ts index 4e650a56e95..43a419899b8 100644 --- a/extensions/google-meet/src/config.ts +++ b/extensions/google-meet/src/config.ts @@ -10,6 +10,7 @@ import { export type GoogleMeetTransport = "chrome" | "chrome-node" | "twilio"; export type GoogleMeetMode = "realtime" | "transcribe"; +export type GoogleMeetChromeAudioFormat = "pcm16-24khz" | "g711-ulaw-8khz"; export type GoogleMeetToolPolicy = RealtimeVoiceAgentConsultToolPolicy; export type GoogleMeetConfig = { @@ -24,6 +25,7 @@ export type GoogleMeetConfig = { defaultMode: GoogleMeetMode; chrome: { audioBackend: "blackhole-2ch"; + audioFormat: GoogleMeetChromeAudioFormat; launch: boolean; browserProfile?: string; guestName: string; @@ -77,6 +79,40 @@ export type GoogleMeetConfig = { }; export const DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [ + "rec", + "-q", + "-t", + "raw", + "-r", + "24000", + "-c", + "1", + "-e", + "signed-integer", + "-b", + "16", + "-L", + "-", +] as const; + +export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [ + "play", + "-q", + "-t", + "raw", + "-r", + "24000", + "-c", + "1", + "-e", + "signed-integer", + "-b", + "16", + "-L", + "-", +] as const; + +export const LEGACY_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [ "rec", "-q", "-t", @@ -92,7 +128,7 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [ "-", ] as const; -export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [ +export const LEGACY_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [ "play", "-q", "-t", @@ -108,6 +144,8 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [ "-", ] as const; +export const DEFAULT_GOOGLE_MEET_CHROME_AUDIO_FORMAT: GoogleMeetChromeAudioFormat = "pcm16-24khz"; + export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`; export const DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE = "Say exactly: I'm here and listening."; @@ -121,6 +159,7 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = { defaultMode: "realtime", chrome: { audioBackend: "blackhole-2ch", + audioFormat: DEFAULT_GOOGLE_MEET_CHROME_AUDIO_FORMAT, launch: true, guestName: "OpenClaw Agent", reuseExistingTab: true, @@ -264,6 +303,37 @@ function resolveMode(value: unknown, fallback: GoogleMeetMode): GoogleMeetMode { return normalized === "realtime" || normalized === "transcribe" ? normalized : fallback; } +function resolveChromeAudioFormat(value: unknown): GoogleMeetChromeAudioFormat | undefined { + const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-"); + switch (normalized) { + case "pcm16-24khz": + case "pcm16-24k": + case "pcm24": + case "pcm": + return "pcm16-24khz"; + case "g711-ulaw-8khz": + case "g711-ulaw-8k": + case "g711-ulaw": + case "mulaw": + case "mu-law": + return "g711-ulaw-8khz"; + default: + return undefined; + } +} + +function defaultAudioInputCommand(format: GoogleMeetChromeAudioFormat): readonly string[] { + return format === "g711-ulaw-8khz" + ? LEGACY_GOOGLE_MEET_AUDIO_INPUT_COMMAND + : DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND; +} + +function defaultAudioOutputCommand(format: GoogleMeetChromeAudioFormat): readonly string[] { + return format === "g711-ulaw-8khz" + ? LEGACY_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND + : DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND; +} + export function resolveGoogleMeetConfig(input: unknown): GoogleMeetConfig { return resolveGoogleMeetConfigWithEnv(input); } @@ -276,6 +346,13 @@ export function resolveGoogleMeetConfigWithEnv( const defaults = asRecord(raw.defaults); const preview = asRecord(raw.preview); const chrome = asRecord(raw.chrome); + const configuredAudioInputCommand = resolveStringArray(chrome.audioInputCommand); + const configuredAudioOutputCommand = resolveStringArray(chrome.audioOutputCommand); + const hasCustomAudioCommand = + configuredAudioInputCommand !== undefined || configuredAudioOutputCommand !== undefined; + const audioFormat = + resolveChromeAudioFormat(chrome.audioFormat) ?? + (hasCustomAudioCommand ? "g711-ulaw-8khz" : DEFAULT_GOOGLE_MEET_CONFIG.chrome.audioFormat); const chromeNode = asRecord(raw.chromeNode); const twilio = asRecord(raw.twilio); const voiceCall = asRecord(raw.voiceCall); @@ -304,6 +381,7 @@ export function resolveGoogleMeetConfigWithEnv( defaultMode: resolveMode(raw.defaultMode, DEFAULT_GOOGLE_MEET_CONFIG.defaultMode), chrome: { audioBackend: "blackhole-2ch", + audioFormat, launch: resolveBoolean(chrome.launch, DEFAULT_GOOGLE_MEET_CONFIG.chrome.launch), browserProfile: normalizeOptionalString(chrome.browserProfile), guestName: @@ -321,11 +399,9 @@ export function resolveGoogleMeetConfigWithEnv( chrome.waitForInCallMs, DEFAULT_GOOGLE_MEET_CONFIG.chrome.waitForInCallMs, ), - audioInputCommand: resolveStringArray(chrome.audioInputCommand) ?? [ - ...DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND, - ], - audioOutputCommand: resolveStringArray(chrome.audioOutputCommand) ?? [ - ...DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND, + audioInputCommand: configuredAudioInputCommand ?? [...defaultAudioInputCommand(audioFormat)], + audioOutputCommand: configuredAudioOutputCommand ?? [ + ...defaultAudioOutputCommand(audioFormat), ], audioBridgeCommand: resolveStringArray(chrome.audioBridgeCommand), audioBridgeHealthCommand: resolveStringArray(chrome.audioBridgeHealthCommand), diff --git a/extensions/google-meet/src/realtime-node.ts b/extensions/google-meet/src/realtime-node.ts index ca84bff11c7..2b6fd25c913 100644 --- a/extensions/google-meet/src/realtime-node.ts +++ b/extensions/google-meet/src/realtime-node.ts @@ -13,7 +13,10 @@ import { submitGoogleMeetConsultWorkingResponse, } from "./agent-consult.js"; import type { GoogleMeetConfig } from "./config.js"; -import { resolveGoogleMeetRealtimeProvider } from "./realtime.js"; +import { + resolveGoogleMeetRealtimeAudioFormat, + resolveGoogleMeetRealtimeProvider, +} from "./realtime.js"; import type { GoogleMeetChromeHealth } from "./transports/types.js"; export type ChromeNodeRealtimeAudioBridgeHandle = { @@ -93,6 +96,7 @@ export async function startNodeRealtimeAudioBridge(params: { bridge = createRealtimeVoiceBridgeSession({ provider: resolved.provider, providerConfig: resolved.providerConfig, + audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config), instructions: params.config.realtime.instructions, initialGreetingInstructions: params.config.realtime.introMessage, triggerGreetingOnReady: false, @@ -100,9 +104,9 @@ export async function startNodeRealtimeAudioBridge(params: { tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy), audioSink: { isOpen: () => !stopped, - sendAudio: (muLaw) => { + sendAudio: (audio) => { lastOutputAt = new Date().toISOString(); - lastOutputBytes += muLaw.byteLength; + lastOutputBytes += audio.byteLength; void params.runtime.nodes .invoke({ nodeId: params.nodeId, @@ -110,7 +114,7 @@ export async function startNodeRealtimeAudioBridge(params: { params: { action: "pushAudio", bridgeId: params.bridgeId, - base64: Buffer.from(muLaw).toString("base64"), + base64: Buffer.from(audio).toString("base64"), }, timeoutMs: 5_000, }) diff --git a/extensions/google-meet/src/realtime.ts b/extensions/google-meet/src/realtime.ts index 5e3585e769f..37dd307d196 100644 --- a/extensions/google-meet/src/realtime.ts +++ b/extensions/google-meet/src/realtime.ts @@ -5,6 +5,8 @@ import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime"; import { createRealtimeVoiceBridgeSession, + REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, resolveConfiguredRealtimeVoiceProvider, type RealtimeVoiceBridgeSession, type RealtimeVoiceProviderConfig, @@ -61,6 +63,12 @@ function splitCommand(argv: string[]): { command: string; args: string[] } { return { command, args }; } +export function resolveGoogleMeetRealtimeAudioFormat(config: GoogleMeetConfig) { + return config.chrome.audioFormat === "g711-ulaw-8khz" + ? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ + : REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ; +} + export function resolveGoogleMeetRealtimeProvider(params: { config: GoogleMeetConfig; fullConfig: OpenClawConfig; @@ -187,6 +195,7 @@ export async function startCommandRealtimeAudioBridge(params: { bridge = createRealtimeVoiceBridgeSession({ provider: resolved.provider, providerConfig: resolved.providerConfig, + audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config), instructions: params.config.realtime.instructions, initialGreetingInstructions: params.config.realtime.introMessage, triggerGreetingOnReady: false, @@ -194,10 +203,10 @@ export async function startCommandRealtimeAudioBridge(params: { tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy), audioSink: { isOpen: () => !stopped, - sendAudio: (muLaw) => { + sendAudio: (audio) => { lastOutputAt = new Date().toISOString(); - lastOutputBytes += muLaw.byteLength; - outputProcess.stdin?.write(muLaw); + lastOutputBytes += audio.byteLength; + outputProcess.stdin?.write(audio); }, clearAudio: clearOutputPlayback, }, diff --git a/extensions/google-meet/src/setup.ts b/extensions/google-meet/src/setup.ts index f7bb65b638d..1d3d20c03e4 100644 --- a/extensions/google-meet/src/setup.ts +++ b/extensions/google-meet/src/setup.ts @@ -104,7 +104,7 @@ export function getGoogleMeetSetupStatus( message: config.chrome.audioBridgeCommand ? "Chrome audio bridge command configured" : config.chrome.audioInputCommand && config.chrome.audioOutputCommand - ? "Chrome command-pair realtime audio bridge configured" + ? `Chrome command-pair realtime audio bridge configured (${config.chrome.audioFormat})` : "Chrome realtime audio bridge not configured", }); diff --git a/extensions/google/realtime-voice-provider.test.ts b/extensions/google/realtime-voice-provider.test.ts index b481d8dc61f..d252df2b98c 100644 --- a/extensions/google/realtime-voice-provider.test.ts +++ b/extensions/google/realtime-voice-provider.test.ts @@ -1,3 +1,4 @@ +import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/realtime-voice"; import { beforeEach, describe, expect, it, vi } from "vitest"; import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js"; @@ -281,6 +282,31 @@ describe("buildGoogleRealtimeVoiceProvider", () => { expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true }); }); + it("accepts PCM16 24 kHz audio without the telephony mu-law hop", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key" }, + audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + await bridge.connect(); + lastConnectParams().callbacks.onopen(); + lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } }); + + bridge.sendAudio(Buffer.alloc(480)); + + expect(session.sendRealtimeInput).toHaveBeenCalledWith({ + audio: { + data: expect.any(String), + mimeType: "audio/pcm;rate=16000", + }, + }); + const sent = Buffer.from(session.sendRealtimeInput.mock.calls[0]?.[0].audio.data, "base64"); + expect(sent).toHaveLength(320); + }); + it("can disable automatic VAD for manual activity signaling experiments", async () => { const provider = buildGoogleRealtimeVoiceProvider(); const bridge = provider.createBridge({ @@ -355,6 +381,38 @@ describe("buildGoogleRealtimeVoiceProvider", () => { expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80); }); + it("can keep Google PCM output as PCM16 24 kHz audio", async () => { + const provider = buildGoogleRealtimeVoiceProvider(); + const onAudio = vi.fn(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "gemini-key" }, + audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + onAudio, + onClearAudio: vi.fn(), + }); + const pcm24k = Buffer.alloc(480); + + await bridge.connect(); + lastConnectParams().callbacks.onmessage({ + setupComplete: { sessionId: "session-1" }, + serverContent: { + modelTurn: { + parts: [ + { + inlineData: { + mimeType: "audio/L16;codec=pcm;rate=24000", + data: pcm24k.toString("base64"), + }, + }, + ], + }, + }, + }); + + expect(onAudio).toHaveBeenCalledTimes(1); + expect(onAudio.mock.calls[0]?.[0]).toEqual(pcm24k); + }); + it("does not forward Google thought text as assistant transcript", async () => { const provider = buildGoogleRealtimeVoiceProvider(); const onTranscript = vi.fn(); diff --git a/extensions/google/realtime-voice-provider.ts b/extensions/google/realtime-voice-provider.ts index 29768f9d5bf..4244e60fc34 100644 --- a/extensions/google/realtime-voice-provider.ts +++ b/extensions/google/realtime-voice-provider.ts @@ -17,6 +17,7 @@ import { } from "@google/genai"; import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard"; import type { + RealtimeVoiceAudioFormat, RealtimeVoiceBridge, RealtimeVoiceBridgeCreateRequest, RealtimeVoiceProviderConfig, @@ -27,6 +28,7 @@ import type { import { convertPcmToMulaw8k, mulawToPcm, + REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, resamplePcm, } from "openclaw/plugin-sdk/realtime-voice"; @@ -38,7 +40,6 @@ const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12- const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore"; const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta"; const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000; -const TELEPHONY_SAMPLE_RATE = 8000; const MAX_PENDING_AUDIO_CHUNKS = 320; const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700; @@ -319,6 +320,19 @@ function isMulawSilence(audio: Buffer): boolean { return audio.length > 0 && audio.every((sample) => sample === 0xff); } +function isPcm16Silence(audio: Buffer): boolean { + const samples = Math.floor(audio.length / 2); + if (samples === 0) { + return false; + } + for (let i = 0; i < samples; i += 1) { + if (audio.readInt16LE(i * 2) !== 0) { + return false; + } + } + return true; +} + class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { readonly supportsToolResultContinuation = true; @@ -331,8 +345,11 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { private consecutiveSilenceMs = 0; private audioStreamEnded = false; private pendingFunctionNames = new Map(); + private readonly audioFormat: RealtimeVoiceAudioFormat; - constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {} + constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) { + this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ; + } async connect(): Promise { this.intentionallyClosed = false; @@ -409,7 +426,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { } return; } - const silent = isMulawSilence(audio); + const silent = this.isSilence(audio); if (silent && this.audioStreamEnded) { return; } @@ -418,9 +435,10 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { this.audioStreamEnded = false; } + const pcm = this.toInputPcm(audio); const pcm16k = resamplePcm( - mulawToPcm(audio), - TELEPHONY_SAMPLE_RATE, + pcm, + this.audioFormat.sampleRateHz, GOOGLE_REALTIME_INPUT_SAMPLE_RATE, ); this.session.sendRealtimeInput({ @@ -438,7 +456,10 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { typeof this.config.silenceDurationMs === "number" ? Math.max(0, Math.floor(this.config.silenceDurationMs)) : DEFAULT_AUDIO_STREAM_END_SILENCE_MS; - this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000); + const bytesPerSample = this.audioFormat.encoding === "pcm16" ? 2 : 1; + this.consecutiveSilenceMs += Math.round( + (audio.length / bytesPerSample / this.audioFormat.sampleRateHz) * 1000, + ); if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) { this.session.sendRealtimeInput({ audioStreamEnd: true }); this.audioStreamEnded = true; @@ -536,6 +557,20 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { return this.connected && this.sessionConfigured; } + private isSilence(audio: Buffer): boolean { + return this.audioFormat.encoding === "pcm16" ? isPcm16Silence(audio) : isMulawSilence(audio); + } + + private toInputPcm(audio: Buffer): Buffer { + return this.audioFormat.encoding === "pcm16" ? audio : mulawToPcm(audio); + } + + private toOutputAudio(pcm: Buffer, sampleRate: number): Buffer { + return this.audioFormat.encoding === "pcm16" + ? resamplePcm(pcm, sampleRate, this.audioFormat.sampleRateHz) + : convertPcmToMulaw8k(pcm, sampleRate); + } + private handleMessage(message: LiveServerMessage): void { if (message.setupComplete) { this.handleSetupComplete(); @@ -585,9 +620,9 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge { if (part.inlineData?.data) { const pcm = Buffer.from(part.inlineData.data, "base64"); const sampleRate = parsePcmSampleRate(part.inlineData.mimeType); - const muLaw = convertPcmToMulaw8k(pcm, sampleRate); - if (muLaw.length > 0) { - this.config.onAudio(muLaw); + const audio = this.toOutputAudio(pcm, sampleRate); + if (audio.length > 0) { + this.config.onAudio(audio); this.config.onMark?.(`audio-${randomUUID()}`); } continue; diff --git a/extensions/openai/realtime-voice-provider.test.ts b/extensions/openai/realtime-voice-provider.test.ts index 1f995e574af..0e1b8d58f4c 100644 --- a/extensions/openai/realtime-voice-provider.test.ts +++ b/extensions/openai/realtime-voice-provider.test.ts @@ -1,3 +1,4 @@ +import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/realtime-voice"; import { beforeEach, describe, expect, it, vi } from "vitest"; import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js"; @@ -56,7 +57,14 @@ vi.mock("ws", () => ({ })); type FakeWebSocketInstance = InstanceType; -type SentRealtimeEvent = { type: string; audio?: string }; +type SentRealtimeEvent = { + type: string; + audio?: string; + session?: { + input_audio_format?: string; + output_audio_format?: string; + }; +}; function parseSent(socket: FakeWebSocketInstance): SentRealtimeEvent[] { return socket.sent.map((payload: string) => JSON.parse(payload) as SentRealtimeEvent); @@ -118,6 +126,10 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(onReady).not.toHaveBeenCalled(); expect(parseSent(socket).map((event) => event.type)).toEqual(["session.update"]); + expect(parseSent(socket)[0]?.session).toMatchObject({ + input_audio_format: "g711_ulaw", + output_audio_format: "g711_ulaw", + }); expect(bridge.isConnected()).toBe(false); socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" }))); @@ -130,6 +142,31 @@ describe("buildOpenAIRealtimeVoiceProvider", () => { expect(bridge.isConnected()).toBe(true); }); + it("can request PCM16 24 kHz realtime audio for Chrome command-pair bridges", async () => { + const provider = buildOpenAIRealtimeVoiceProvider(); + const bridge = provider.createBridge({ + providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret + audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + onAudio: vi.fn(), + onClearAudio: vi.fn(), + }); + + const connecting = bridge.connect(); + const socket = FakeWebSocket.instances[0]; + if (!socket) { + throw new Error("expected bridge to create a websocket"); + } + + socket.readyState = FakeWebSocket.OPEN; + socket.emit("open"); + await connecting; + + expect(parseSent(socket)[0]?.session).toMatchObject({ + input_audio_format: "pcm16", + output_audio_format: "pcm16", + }); + }); + it("settles cleanly when closed before the websocket opens", async () => { const provider = buildOpenAIRealtimeVoiceProvider(); const onClose = vi.fn(); diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts index bb24feb8885..3a43da4c8d0 100644 --- a/extensions/openai/realtime-voice-provider.ts +++ b/extensions/openai/realtime-voice-provider.ts @@ -6,6 +6,7 @@ import { resolveDebugProxySettings, } from "openclaw/plugin-sdk/proxy-capture"; import type { + RealtimeVoiceAudioFormat, RealtimeVoiceBridge, RealtimeVoiceBrowserSession, RealtimeVoiceBrowserSessionCreateRequest, @@ -14,6 +15,7 @@ import type { RealtimeVoiceProviderPlugin, RealtimeVoiceTool, } from "openclaw/plugin-sdk/realtime-voice"; +import { REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ } from "openclaw/plugin-sdk/realtime-voice"; import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime"; import WebSocket from "ws"; @@ -141,8 +143,11 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { private toolCallBuffers = new Map(); private readonly flowId = randomUUID(); private sessionReadyFired = false; + private readonly audioFormat: RealtimeVoiceAudioFormat; - constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {} + constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) { + this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ; + } async connect(): Promise { this.intentionallyClosed = false; @@ -407,8 +412,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { modalities: ["text", "audio"], instructions: cfg.instructions, voice: cfg.voice ?? "alloy", - input_audio_format: "g711_ulaw", - output_audio_format: "g711_ulaw", + input_audio_format: this.resolveRealtimeAudioFormat(), + output_audio_format: this.resolveRealtimeAudioFormat(), input_audio_transcription: { model: "whisper-1", }, @@ -431,6 +436,10 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge { this.sendEvent(sessionUpdate); } + private resolveRealtimeAudioFormat(): "g711_ulaw" | "pcm16" { + return this.audioFormat.encoding === "pcm16" ? "pcm16" : "g711_ulaw"; + } + private handleEvent(event: RealtimeEvent): void { switch (event.type) { case "session.created": diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts index 4ba819eda70..9966c347784 100644 --- a/src/plugin-sdk/realtime-voice.ts +++ b/src/plugin-sdk/realtime-voice.ts @@ -1,5 +1,6 @@ export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; export type { + RealtimeVoiceAudioFormat, RealtimeVoiceBridge, RealtimeVoiceBridgeCallbacks, RealtimeVoiceBrowserSession, @@ -15,6 +16,10 @@ export type { RealtimeVoiceToolCallEvent, RealtimeVoiceToolResultOptions, } from "../realtime-voice/provider-types.js"; +export { + REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ, + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, +} from "../realtime-voice/provider-types.js"; export { buildRealtimeVoiceAgentConsultChatMessage, buildRealtimeVoiceAgentConsultPrompt, diff --git a/src/realtime-voice/provider-types.ts b/src/realtime-voice/provider-types.ts index 9c59ef7b0f1..46caa9ee58a 100644 --- a/src/realtime-voice/provider-types.ts +++ b/src/realtime-voice/provider-types.ts @@ -6,6 +6,30 @@ export type RealtimeVoiceRole = "user" | "assistant"; export type RealtimeVoiceCloseReason = "completed" | "error"; +export type RealtimeVoiceAudioFormat = + | { + encoding: "g711_ulaw"; + sampleRateHz: 8000; + channels: 1; + } + | { + encoding: "pcm16"; + sampleRateHz: 24000; + channels: 1; + }; + +export const REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ: RealtimeVoiceAudioFormat = { + encoding: "g711_ulaw", + sampleRateHz: 8000, + channels: 1, +}; + +export const REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ: RealtimeVoiceAudioFormat = { + encoding: "pcm16", + sampleRateHz: 24000, + channels: 1, +}; + export type RealtimeVoiceTool = { type: "function"; name: string; @@ -29,7 +53,7 @@ export type RealtimeVoiceToolResultOptions = { }; export type RealtimeVoiceBridgeCallbacks = { - onAudio: (muLaw: Buffer) => void; + onAudio: (audio: Buffer) => void; onClearAudio: () => void; onMark?: (markName: string) => void; onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void; @@ -53,6 +77,7 @@ export type RealtimeVoiceProviderConfiguredContext = { export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & { providerConfig: RealtimeVoiceProviderConfig; + audioFormat?: RealtimeVoiceAudioFormat; instructions?: string; tools?: RealtimeVoiceTool[]; }; diff --git a/src/realtime-voice/session-runtime.test.ts b/src/realtime-voice/session-runtime.test.ts index 6bbdd89e5ac..64fcb0ed8d0 100644 --- a/src/realtime-voice/session-runtime.test.ts +++ b/src/realtime-voice/session-runtime.test.ts @@ -1,6 +1,9 @@ import { describe, expect, it, vi } from "vitest"; import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; -import type { RealtimeVoiceBridge } from "./provider-types.js"; +import { + REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + type RealtimeVoiceBridge, +} from "./provider-types.js"; import { createRealtimeVoiceBridgeSession } from "./session-runtime.js"; function makeBridge(overrides: Partial = {}): RealtimeVoiceBridge { @@ -54,6 +57,28 @@ describe("realtime voice bridge session runtime", () => { expect(sendMark).toHaveBeenCalledWith("mark-1"); }); + it("passes the requested audio format to the provider bridge", () => { + let request: Parameters[0] | undefined; + const provider: RealtimeVoiceProviderPlugin = { + id: "test", + label: "Test", + isConfigured: () => true, + createBridge: (nextRequest) => { + request = nextRequest; + return makeBridge(); + }, + }; + + createRealtimeVoiceBridgeSession({ + provider, + providerConfig: {}, + audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ, + audioSink: { sendAudio: vi.fn() }, + }); + + expect(request?.audioFormat).toEqual(REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ); + }); + it("can acknowledge provider marks without transport mark support", () => { let callbacks: Parameters[0] | undefined; const bridge = makeBridge(); diff --git a/src/realtime-voice/session-runtime.ts b/src/realtime-voice/session-runtime.ts index 0569fe292ac..1f188d30996 100644 --- a/src/realtime-voice/session-runtime.ts +++ b/src/realtime-voice/session-runtime.ts @@ -1,6 +1,7 @@ import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; import type { RealtimeVoiceBridge, + RealtimeVoiceAudioFormat, RealtimeVoiceCloseReason, RealtimeVoiceProviderConfig, RealtimeVoiceRole, @@ -11,7 +12,7 @@ import type { export type RealtimeVoiceAudioSink = { isOpen?: () => boolean; - sendAudio: (muLaw: Buffer) => void; + sendAudio: (audio: Buffer) => void; clearAudio?: () => void; sendMark?: (markName: string) => void; }; @@ -33,6 +34,7 @@ export type RealtimeVoiceBridgeSession = { export type RealtimeVoiceBridgeSessionParams = { provider: RealtimeVoiceProviderPlugin; providerConfig: RealtimeVoiceProviderConfig; + audioFormat?: RealtimeVoiceAudioFormat; audioSink: RealtimeVoiceAudioSink; instructions?: string; initialGreetingInstructions?: string; @@ -73,11 +75,12 @@ export function createRealtimeVoiceBridgeSession( const canSendAudio = () => params.audioSink.isOpen?.() ?? true; bridge = params.provider.createBridge({ providerConfig: params.providerConfig, + audioFormat: params.audioFormat, instructions: params.instructions, tools: params.tools, - onAudio: (muLaw) => { + onAudio: (audio) => { if (canSendAudio()) { - params.audioSink.sendAudio(muLaw); + params.audioSink.sendAudio(audio); } }, onClearAudio: () => {