fix(google-meet): use PCM audio for Chrome realtime

This commit is contained in:
Peter Steinberger
2026-04-27 12:54:54 +01:00
parent 27a4bba90a
commit d73e2ee774
19 changed files with 395 additions and 59 deletions

View File

@@ -126,6 +126,7 @@ Docs: https://docs.openclaw.ai
- Web search: route plugin-scoped web_search SecretRefs through the active runtime config snapshot so provider execution receives resolved credentials across app/runtime paths, including `plugins.entries.brave.config.webSearch.apiKey`. Fixes #68690. Thanks @VACInc.
- Voice Call: allow SecretRef-backed Twilio auth tokens and call-specific OpenAI/ElevenLabs TTS API keys through the plugin config surface. Fixes #68690. Thanks @joshavant.
- Google Meet: clean stale chrome-node realtime audio bridges by URL before rejoining, expose active node bridge inspection, and tolerate transient node input pull failures instead of dropping the Meet session. Fixes #72371. (#72372) Thanks @BsnizND.
- Google Meet: use 24 kHz PCM16 for Chrome command-pair realtime audio by default, preserve legacy 8 kHz G.711 mu-law custom command pairs, and let realtime providers negotiate the selected bridge audio format. Fixes #72525. Thanks @BsnizND.
- Google Meet: clear queued Gemini Live playback when realtime interruptions arrive, restart Chrome command-pair audio output after clears, and expose Google Live interruption/VAD config knobs for Meet and Voice Call realtime bridges. Fixes #72523. (#72524) Thanks @BsnizND.
- Google Meet: add `realtime.agentId` so live meeting consults can target a named OpenClaw agent instead of always using `main`. (#72381) Thanks @BsnizND.
- Google Meet: route stateful `google_meet` tool actions through the gateway-owned runtime so created or joined realtime sessions remain visible to status, speak, and leave after the agent turn ends. Fixes #72440. (#72441) Thanks @BsnizND.

View File

@@ -1,4 +1,4 @@
5027142b42acd038bb3cd15e53a0d45293103448a3aee1072500352095e14242 config-baseline.json
33425d446eda183d3574ee754bb44e7e546ea33afa855fc979f94b1e102bf047 config-baseline.json
ecb702eee54bcb697916944440e13208ac7a640a8e07f44072bb79e9284ca994 config-baseline.core.json
07963db49502132f26db396c56b36e018b110e6c55a68b3cb012d3ec96f43901 config-baseline.channel.json
ed65cefbef96f034ce2b73069d9d5bacc341a43489ff9b20a34d40956b877f79 config-baseline.plugin.json
13d038300d90d4dd064aa2ac79def867799d1be403cf9d3e81dfad35ef459a21 config-baseline.plugin.json

View File

@@ -336,7 +336,7 @@ Common failure checks:
The Chrome realtime default uses two external tools:
- `sox`: command-line audio utility. The plugin uses its `rec` and `play`
commands for the default 8 kHz G.711 mu-law audio bridge.
commands for the default 24 kHz PCM16 audio bridge.
- `blackhole-2ch`: macOS virtual audio driver. It creates the `BlackHole 2ch`
audio device that Chrome/Meet can route through.
@@ -887,10 +887,13 @@ Defaults:
opening duplicates
- `chrome.waitForInCallMs: 20000`: wait for the Meet tab to report in-call
before the realtime intro is triggered
- `chrome.audioInputCommand`: SoX `rec` command writing 8 kHz G.711 mu-law
audio to stdout
- `chrome.audioOutputCommand`: SoX `play` command reading 8 kHz G.711 mu-law
audio from stdin
- `chrome.audioFormat: "pcm16-24khz"`: command-pair audio format. Use
`"g711-ulaw-8khz"` only for legacy/custom command pairs that still emit
telephony audio.
- `chrome.audioInputCommand`: SoX `rec` command writing audio in
`chrome.audioFormat`
- `chrome.audioOutputCommand`: SoX `play` command reading audio in
`chrome.audioFormat`
- `realtime.provider: "openai"`
- `realtime.toolPolicy: "safe-read-only"`
- `realtime.instructions`: brief spoken replies, with
@@ -1313,8 +1316,9 @@ phone dial-in participation.
Chrome realtime mode needs either:
- `chrome.audioInputCommand` plus `chrome.audioOutputCommand`: OpenClaw owns the
realtime model bridge and pipes 8 kHz G.711 mu-law audio between those
commands and the selected realtime voice provider.
realtime model bridge and pipes audio in `chrome.audioFormat` between those
commands and the selected realtime voice provider. The default Chrome path is
24 kHz PCM16; 8 kHz G.711 mu-law remains available for legacy command pairs.
- `chrome.audioBridgeCommand`: an external bridge command owns the whole local
audio path and must exit after starting or validating its daemon.

View File

@@ -257,19 +257,21 @@ describe("google-meet plugin", () => {
reuseExistingTab: true,
autoJoin: true,
waitForInCallMs: 20000,
audioFormat: "pcm16-24khz",
audioInputCommand: [
"rec",
"-q",
"-t",
"raw",
"-r",
"8000",
"24000",
"-c",
"1",
"-e",
"mu-law",
"signed-integer",
"-b",
"8",
"16",
"-L",
"-",
],
audioOutputCommand: [
@@ -278,13 +280,14 @@ describe("google-meet plugin", () => {
"-t",
"raw",
"-r",
"8000",
"24000",
"-c",
"1",
"-e",
"mu-law",
"signed-integer",
"-b",
"8",
"16",
"-L",
"-",
],
},
@@ -310,6 +313,21 @@ describe("google-meet plugin", () => {
).toBe("jay");
});
it("keeps legacy command-pair audio format when custom commands omit a format", () => {
expect(
resolveGoogleMeetConfig({
chrome: {
audioInputCommand: ["capture-legacy"],
audioOutputCommand: ["play-legacy"],
},
}).chrome,
).toMatchObject({
audioFormat: "g711-ulaw-8khz",
audioInputCommand: ["capture-legacy"],
audioOutputCommand: ["play-legacy"],
});
});
it("uses env fallbacks for OAuth, preview, and default meeting values", () => {
expect(
resolveGoogleMeetConfigWithEnv(
@@ -2085,6 +2103,11 @@ describe("google-meet plugin", () => {
clearCount: 1,
});
expect(callbacks).toMatchObject({
audioFormat: {
encoding: "pcm16",
sampleRateHz: 24000,
channels: 1,
},
tools: [
expect.objectContaining({
name: "openclaw_agent_consult",
@@ -2263,6 +2286,11 @@ describe("google-meet plugin", () => {
handle.speak("Say exactly: hello from the node.");
expect(bridge.triggerGreeting).toHaveBeenLastCalledWith("Say exactly: hello from the node.");
expect(callbacks).toMatchObject({
audioFormat: {
encoding: "pcm16",
sampleRateHz: 24000,
channels: 1,
},
tools: [
expect.objectContaining({
name: "openclaw_agent_consult",

View File

@@ -76,14 +76,19 @@ const googleMeetConfigSchema = {
help: "Waits for Chrome to report that the Meet tab is in-call before the realtime intro speaks.",
advanced: true,
},
"chrome.audioFormat": {
label: "Audio Format",
help: "Command-pair audio format. PCM16 24 kHz is the default Chrome/Meet path; G.711 mu-law 8 kHz remains available for legacy command pairs.",
advanced: true,
},
"chrome.audioInputCommand": {
label: "Audio Input Command",
help: "Command that writes 8 kHz G.711 mu-law meeting audio to stdout.",
help: "Command that writes meeting audio to stdout in chrome.audioFormat.",
advanced: true,
},
"chrome.audioOutputCommand": {
label: "Audio Output Command",
help: "Command that reads 8 kHz G.711 mu-law assistant audio from stdin.",
help: "Command that reads assistant audio from stdin in chrome.audioFormat.",
advanced: true,
},
"chrome.audioBridgeCommand": { label: "Audio Bridge Command", advanced: true },

View File

@@ -56,12 +56,17 @@
},
"chrome.audioInputCommand": {
"label": "Audio Input Command",
"help": "Command that writes 8 kHz G.711 mu-law meeting audio to stdout.",
"help": "Command that writes meeting audio to stdout in chrome.audioFormat.",
"advanced": true
},
"chrome.audioOutputCommand": {
"label": "Audio Output Command",
"help": "Command that reads 8 kHz G.711 mu-law assistant audio from stdin.",
"help": "Command that reads assistant audio from stdin in chrome.audioFormat.",
"advanced": true
},
"chrome.audioFormat": {
"label": "Audio Format",
"help": "Command-pair audio format. PCM16 24 kHz is the default Chrome/Meet path; G.711 mu-law 8 kHz remains available for legacy command pairs.",
"advanced": true
},
"chrome.audioBridgeCommand": {
@@ -232,6 +237,11 @@
"type": "number",
"default": 20000
},
"audioFormat": {
"type": "string",
"enum": ["pcm16-24khz", "g711-ulaw-8khz"],
"default": "pcm16-24khz"
},
"audioInputCommand": {
"type": "array",
"default": [
@@ -240,13 +250,14 @@
"-t",
"raw",
"-r",
"8000",
"24000",
"-c",
"1",
"-e",
"mu-law",
"signed-integer",
"-b",
"8",
"16",
"-L",
"-"
],
"items": {
@@ -261,13 +272,14 @@
"-t",
"raw",
"-r",
"8000",
"24000",
"-c",
"1",
"-e",
"mu-law",
"signed-integer",
"-b",
"8",
"16",
"-L",
"-"
],
"items": {

View File

@@ -218,7 +218,7 @@ describe("google-meet CLI", () => {
{
id: "audio-bridge",
ok: true,
message: "Chrome command-pair realtime audio bridge configured",
message: "Chrome command-pair realtime audio bridge configured (pcm16-24khz)",
},
],
}),
@@ -226,7 +226,7 @@ describe("google-meet CLI", () => {
}).parseAsync(["googlemeet", "setup"], { from: "user" });
expect(stdout.output()).toContain("Google Meet setup: OK");
expect(stdout.output()).toContain(
"[ok] audio-bridge: Chrome command-pair realtime audio bridge configured",
"[ok] audio-bridge: Chrome command-pair realtime audio bridge configured (pcm16-24khz)",
);
expect(stdout.output()).not.toContain('"checks"');
} finally {

View File

@@ -10,6 +10,7 @@ import {
export type GoogleMeetTransport = "chrome" | "chrome-node" | "twilio";
export type GoogleMeetMode = "realtime" | "transcribe";
export type GoogleMeetChromeAudioFormat = "pcm16-24khz" | "g711-ulaw-8khz";
export type GoogleMeetToolPolicy = RealtimeVoiceAgentConsultToolPolicy;
export type GoogleMeetConfig = {
@@ -24,6 +25,7 @@ export type GoogleMeetConfig = {
defaultMode: GoogleMeetMode;
chrome: {
audioBackend: "blackhole-2ch";
audioFormat: GoogleMeetChromeAudioFormat;
launch: boolean;
browserProfile?: string;
guestName: string;
@@ -77,6 +79,40 @@ export type GoogleMeetConfig = {
};
// Default capture command for the Chrome command-pair bridge: SoX `rec`
// writing raw 24 kHz mono signed 16-bit little-endian PCM to stdout
// (matches the "pcm16-24khz" audio format documented for chrome.audioFormat).
export const DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [
"rec",
"-q", // quiet: keep SoX progress output off the audio pipe
"-t",
"raw", // headerless stream; format is fully described by the flags below
"-r",
"24000",
"-c",
"1",
"-e",
"signed-integer",
"-b",
"16",
"-L", // little-endian sample order
"-", // write to stdout
] as const;
// Default playback command for the Chrome command-pair bridge: SoX `play`
// reading raw 24 kHz mono signed 16-bit little-endian PCM from stdin.
// Mirrors DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND so both directions agree
// on the "pcm16-24khz" wire format.
export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
"play",
"-q", // quiet: keep SoX progress output off the terminal
"-t",
"raw", // headerless stream; format is fully described by the flags below
"-r",
"24000",
"-c",
"1",
"-e",
"signed-integer",
"-b",
"16",
"-L", // little-endian sample order
"-", // read from stdin
] as const;
export const LEGACY_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [
"rec",
"-q",
"-t",
@@ -92,7 +128,7 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [
"-",
] as const;
export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
export const LEGACY_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
"play",
"-q",
"-t",
@@ -108,6 +144,8 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
"-",
] as const;
// Default command-pair bridge format: 24 kHz PCM16 is the standard Chrome/Meet
// path; "g711-ulaw-8khz" remains selectable for legacy custom command pairs.
export const DEFAULT_GOOGLE_MEET_CHROME_AUDIO_FORMAT: GoogleMeetChromeAudioFormat = "pcm16-24khz";
export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`;
export const DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE = "Say exactly: I'm here and listening.";
@@ -121,6 +159,7 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
defaultMode: "realtime",
chrome: {
audioBackend: "blackhole-2ch",
audioFormat: DEFAULT_GOOGLE_MEET_CHROME_AUDIO_FORMAT,
launch: true,
guestName: "OpenClaw Agent",
reuseExistingTab: true,
@@ -264,6 +303,37 @@ function resolveMode(value: unknown, fallback: GoogleMeetMode): GoogleMeetMode {
return normalized === "realtime" || normalized === "transcribe" ? normalized : fallback;
}
/**
 * Normalizes a user-supplied `chrome.audioFormat` value to a canonical id.
 *
 * Case-insensitive; underscores are treated as hyphens, so e.g. "PCM16_24KHZ"
 * resolves the same as "pcm16-24khz".
 *
 * @param value - Raw config value (any type; non-strings resolve to undefined).
 * @returns The canonical format id, or undefined when the value is absent or
 *   unrecognized (the caller then applies its own default/legacy fallback).
 */
function resolveChromeAudioFormat(value: unknown): GoogleMeetChromeAudioFormat | undefined {
  const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-");
  switch (normalized) {
    case "pcm16-24khz":
    case "pcm16-24k":
    // Plain "pcm16" previously fell through to undefined, which silently
    // selected the legacy g711 fallback when custom commands were configured.
    case "pcm16":
    case "pcm24":
    case "pcm":
      return "pcm16-24khz";
    case "g711-ulaw-8khz":
    case "g711-ulaw-8k":
    case "g711-ulaw":
    // Accept the common "ulaw" spelling alongside "mulaw"/"mu-law".
    case "ulaw":
    case "mulaw":
    case "mu-law":
      return "g711-ulaw-8khz";
    default:
      return undefined;
  }
}
/** Returns the default SoX capture command for the selected bridge audio format. */
function defaultAudioInputCommand(format: GoogleMeetChromeAudioFormat): readonly string[] {
  if (format === "g711-ulaw-8khz") {
    return LEGACY_GOOGLE_MEET_AUDIO_INPUT_COMMAND;
  }
  return DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND;
}
/** Returns the default SoX playback command for the selected bridge audio format. */
function defaultAudioOutputCommand(format: GoogleMeetChromeAudioFormat): readonly string[] {
  if (format === "g711-ulaw-8khz") {
    return LEGACY_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND;
  }
  return DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND;
}
/**
 * Resolves a raw (untyped) plugin config object into a fully-defaulted
 * GoogleMeetConfig. Thin convenience wrapper that delegates to
 * resolveGoogleMeetConfigWithEnv without explicit env overrides.
 */
export function resolveGoogleMeetConfig(input: unknown): GoogleMeetConfig {
return resolveGoogleMeetConfigWithEnv(input);
}
@@ -276,6 +346,13 @@ export function resolveGoogleMeetConfigWithEnv(
const defaults = asRecord(raw.defaults);
const preview = asRecord(raw.preview);
const chrome = asRecord(raw.chrome);
const configuredAudioInputCommand = resolveStringArray(chrome.audioInputCommand);
const configuredAudioOutputCommand = resolveStringArray(chrome.audioOutputCommand);
const hasCustomAudioCommand =
configuredAudioInputCommand !== undefined || configuredAudioOutputCommand !== undefined;
const audioFormat =
resolveChromeAudioFormat(chrome.audioFormat) ??
(hasCustomAudioCommand ? "g711-ulaw-8khz" : DEFAULT_GOOGLE_MEET_CONFIG.chrome.audioFormat);
const chromeNode = asRecord(raw.chromeNode);
const twilio = asRecord(raw.twilio);
const voiceCall = asRecord(raw.voiceCall);
@@ -304,6 +381,7 @@ export function resolveGoogleMeetConfigWithEnv(
defaultMode: resolveMode(raw.defaultMode, DEFAULT_GOOGLE_MEET_CONFIG.defaultMode),
chrome: {
audioBackend: "blackhole-2ch",
audioFormat,
launch: resolveBoolean(chrome.launch, DEFAULT_GOOGLE_MEET_CONFIG.chrome.launch),
browserProfile: normalizeOptionalString(chrome.browserProfile),
guestName:
@@ -321,11 +399,9 @@ export function resolveGoogleMeetConfigWithEnv(
chrome.waitForInCallMs,
DEFAULT_GOOGLE_MEET_CONFIG.chrome.waitForInCallMs,
),
audioInputCommand: resolveStringArray(chrome.audioInputCommand) ?? [
...DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND,
],
audioOutputCommand: resolveStringArray(chrome.audioOutputCommand) ?? [
...DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND,
audioInputCommand: configuredAudioInputCommand ?? [...defaultAudioInputCommand(audioFormat)],
audioOutputCommand: configuredAudioOutputCommand ?? [
...defaultAudioOutputCommand(audioFormat),
],
audioBridgeCommand: resolveStringArray(chrome.audioBridgeCommand),
audioBridgeHealthCommand: resolveStringArray(chrome.audioBridgeHealthCommand),

View File

@@ -13,7 +13,10 @@ import {
submitGoogleMeetConsultWorkingResponse,
} from "./agent-consult.js";
import type { GoogleMeetConfig } from "./config.js";
import { resolveGoogleMeetRealtimeProvider } from "./realtime.js";
import {
resolveGoogleMeetRealtimeAudioFormat,
resolveGoogleMeetRealtimeProvider,
} from "./realtime.js";
import type { GoogleMeetChromeHealth } from "./transports/types.js";
export type ChromeNodeRealtimeAudioBridgeHandle = {
@@ -93,6 +96,7 @@ export async function startNodeRealtimeAudioBridge(params: {
bridge = createRealtimeVoiceBridgeSession({
provider: resolved.provider,
providerConfig: resolved.providerConfig,
audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config),
instructions: params.config.realtime.instructions,
initialGreetingInstructions: params.config.realtime.introMessage,
triggerGreetingOnReady: false,
@@ -100,9 +104,9 @@ export async function startNodeRealtimeAudioBridge(params: {
tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy),
audioSink: {
isOpen: () => !stopped,
sendAudio: (muLaw) => {
sendAudio: (audio) => {
lastOutputAt = new Date().toISOString();
lastOutputBytes += muLaw.byteLength;
lastOutputBytes += audio.byteLength;
void params.runtime.nodes
.invoke({
nodeId: params.nodeId,
@@ -110,7 +114,7 @@ export async function startNodeRealtimeAudioBridge(params: {
params: {
action: "pushAudio",
bridgeId: params.bridgeId,
base64: Buffer.from(muLaw).toString("base64"),
base64: Buffer.from(audio).toString("base64"),
},
timeoutMs: 5_000,
})

View File

@@ -5,6 +5,8 @@ import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime";
import {
createRealtimeVoiceBridgeSession,
REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
resolveConfiguredRealtimeVoiceProvider,
type RealtimeVoiceBridgeSession,
type RealtimeVoiceProviderConfig,
@@ -61,6 +63,12 @@ function splitCommand(argv: string[]): { command: string; args: string[] } {
return { command, args };
}
/**
 * Maps the configured chrome.audioFormat onto the realtime-voice bridge
 * format descriptor handed to providers: legacy telephony config keeps
 * 8 kHz G.711 mu-law; everything else uses 24 kHz PCM16.
 */
export function resolveGoogleMeetRealtimeAudioFormat(config: GoogleMeetConfig) {
  if (config.chrome.audioFormat === "g711-ulaw-8khz") {
    return REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ;
  }
  return REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ;
}
export function resolveGoogleMeetRealtimeProvider(params: {
config: GoogleMeetConfig;
fullConfig: OpenClawConfig;
@@ -187,6 +195,7 @@ export async function startCommandRealtimeAudioBridge(params: {
bridge = createRealtimeVoiceBridgeSession({
provider: resolved.provider,
providerConfig: resolved.providerConfig,
audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config),
instructions: params.config.realtime.instructions,
initialGreetingInstructions: params.config.realtime.introMessage,
triggerGreetingOnReady: false,
@@ -194,10 +203,10 @@ export async function startCommandRealtimeAudioBridge(params: {
tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy),
audioSink: {
isOpen: () => !stopped,
sendAudio: (muLaw) => {
sendAudio: (audio) => {
lastOutputAt = new Date().toISOString();
lastOutputBytes += muLaw.byteLength;
outputProcess.stdin?.write(muLaw);
lastOutputBytes += audio.byteLength;
outputProcess.stdin?.write(audio);
},
clearAudio: clearOutputPlayback,
},

View File

@@ -104,7 +104,7 @@ export function getGoogleMeetSetupStatus(
message: config.chrome.audioBridgeCommand
? "Chrome audio bridge command configured"
: config.chrome.audioInputCommand && config.chrome.audioOutputCommand
? "Chrome command-pair realtime audio bridge configured"
? `Chrome command-pair realtime audio bridge configured (${config.chrome.audioFormat})`
: "Chrome realtime audio bridge not configured",
});

View File

@@ -1,3 +1,4 @@
import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/realtime-voice";
import { beforeEach, describe, expect, it, vi } from "vitest";
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
@@ -281,6 +282,31 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
});
it("accepts PCM16 24 kHz audio without the telephony mu-law hop", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
await bridge.connect();
lastConnectParams().callbacks.onopen();
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
bridge.sendAudio(Buffer.alloc(480));
expect(session.sendRealtimeInput).toHaveBeenCalledWith({
audio: {
data: expect.any(String),
mimeType: "audio/pcm;rate=16000",
},
});
const sent = Buffer.from(session.sendRealtimeInput.mock.calls[0]?.[0].audio.data, "base64");
expect(sent).toHaveLength(320);
});
it("can disable automatic VAD for manual activity signaling experiments", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const bridge = provider.createBridge({
@@ -355,6 +381,38 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80);
});
it("can keep Google PCM output as PCM16 24 kHz audio", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onAudio = vi.fn();
const bridge = provider.createBridge({
providerConfig: { apiKey: "gemini-key" },
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
onAudio,
onClearAudio: vi.fn(),
});
const pcm24k = Buffer.alloc(480);
await bridge.connect();
lastConnectParams().callbacks.onmessage({
setupComplete: { sessionId: "session-1" },
serverContent: {
modelTurn: {
parts: [
{
inlineData: {
mimeType: "audio/L16;codec=pcm;rate=24000",
data: pcm24k.toString("base64"),
},
},
],
},
},
});
expect(onAudio).toHaveBeenCalledTimes(1);
expect(onAudio.mock.calls[0]?.[0]).toEqual(pcm24k);
});
it("does not forward Google thought text as assistant transcript", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onTranscript = vi.fn();

View File

@@ -17,6 +17,7 @@ import {
} from "@google/genai";
import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
import type {
RealtimeVoiceAudioFormat,
RealtimeVoiceBridge,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
@@ -27,6 +28,7 @@ import type {
import {
convertPcmToMulaw8k,
mulawToPcm,
REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
resamplePcm,
} from "openclaw/plugin-sdk/realtime-voice";
@@ -38,7 +40,6 @@ const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
const TELEPHONY_SAMPLE_RATE = 8000;
const MAX_PENDING_AUDIO_CHUNKS = 320;
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
@@ -319,6 +320,19 @@ function isMulawSilence(audio: Buffer): boolean {
return audio.length > 0 && audio.every((sample) => sample === 0xff);
}
/**
 * True when the buffer holds only zero-valued PCM16 samples (digital silence).
 * Buffers shorter than one full sample (0 or 1 bytes) are not silence; a
 * trailing odd byte is ignored, matching 16-bit sample framing.
 */
function isPcm16Silence(audio: Buffer): boolean {
  if (audio.length < 2) {
    return false;
  }
  for (let offset = 0; offset + 1 < audio.length; offset += 2) {
    if (audio.readInt16LE(offset) !== 0) {
      return false;
    }
  }
  return true;
}
class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
readonly supportsToolResultContinuation = true;
@@ -331,8 +345,11 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
private consecutiveSilenceMs = 0;
private audioStreamEnded = false;
private pendingFunctionNames = new Map<string, string>();
private readonly audioFormat: RealtimeVoiceAudioFormat;
constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {}
constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {
this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ;
}
async connect(): Promise<void> {
this.intentionallyClosed = false;
@@ -409,7 +426,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
}
return;
}
const silent = isMulawSilence(audio);
const silent = this.isSilence(audio);
if (silent && this.audioStreamEnded) {
return;
}
@@ -418,9 +435,10 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
this.audioStreamEnded = false;
}
const pcm = this.toInputPcm(audio);
const pcm16k = resamplePcm(
mulawToPcm(audio),
TELEPHONY_SAMPLE_RATE,
pcm,
this.audioFormat.sampleRateHz,
GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
);
this.session.sendRealtimeInput({
@@ -438,7 +456,10 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
typeof this.config.silenceDurationMs === "number"
? Math.max(0, Math.floor(this.config.silenceDurationMs))
: DEFAULT_AUDIO_STREAM_END_SILENCE_MS;
this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000);
const bytesPerSample = this.audioFormat.encoding === "pcm16" ? 2 : 1;
this.consecutiveSilenceMs += Math.round(
(audio.length / bytesPerSample / this.audioFormat.sampleRateHz) * 1000,
);
if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) {
this.session.sendRealtimeInput({ audioStreamEnd: true });
this.audioStreamEnded = true;
@@ -536,6 +557,20 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
return this.connected && this.sessionConfigured;
}
// Format-aware silence probe: dispatches to the PCM16 or mu-law checker
// depending on the negotiated bridge audio format.
private isSilence(audio: Buffer): boolean {
  if (this.audioFormat.encoding === "pcm16") {
    return isPcm16Silence(audio);
  }
  return isMulawSilence(audio);
}
// Converts inbound bridge audio to linear PCM for Google: PCM16 input is
// already linear and passes through; mu-law input is decoded first.
private toInputPcm(audio: Buffer): Buffer {
  if (this.audioFormat.encoding === "pcm16") {
    return audio;
  }
  return mulawToPcm(audio);
}
// Converts Google's PCM output to the bridge's wire format: resampled PCM16
// for PCM bridges, or an 8 kHz G.711 mu-law conversion for telephony bridges.
private toOutputAudio(pcm: Buffer, sampleRate: number): Buffer {
  if (this.audioFormat.encoding === "pcm16") {
    return resamplePcm(pcm, sampleRate, this.audioFormat.sampleRateHz);
  }
  return convertPcmToMulaw8k(pcm, sampleRate);
}
private handleMessage(message: LiveServerMessage): void {
if (message.setupComplete) {
this.handleSetupComplete();
@@ -585,9 +620,9 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
if (part.inlineData?.data) {
const pcm = Buffer.from(part.inlineData.data, "base64");
const sampleRate = parsePcmSampleRate(part.inlineData.mimeType);
const muLaw = convertPcmToMulaw8k(pcm, sampleRate);
if (muLaw.length > 0) {
this.config.onAudio(muLaw);
const audio = this.toOutputAudio(pcm, sampleRate);
if (audio.length > 0) {
this.config.onAudio(audio);
this.config.onMark?.(`audio-${randomUUID()}`);
}
continue;

View File

@@ -1,3 +1,4 @@
import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/realtime-voice";
import { beforeEach, describe, expect, it, vi } from "vitest";
import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
@@ -56,7 +57,14 @@ vi.mock("ws", () => ({
}));
type FakeWebSocketInstance = InstanceType<typeof FakeWebSocket>;
type SentRealtimeEvent = { type: string; audio?: string };
type SentRealtimeEvent = {
type: string;
audio?: string;
session?: {
input_audio_format?: string;
output_audio_format?: string;
};
};
function parseSent(socket: FakeWebSocketInstance): SentRealtimeEvent[] {
return socket.sent.map((payload: string) => JSON.parse(payload) as SentRealtimeEvent);
@@ -118,6 +126,10 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
expect(onReady).not.toHaveBeenCalled();
expect(parseSent(socket).map((event) => event.type)).toEqual(["session.update"]);
expect(parseSent(socket)[0]?.session).toMatchObject({
input_audio_format: "g711_ulaw",
output_audio_format: "g711_ulaw",
});
expect(bridge.isConnected()).toBe(false);
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
@@ -130,6 +142,31 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
expect(bridge.isConnected()).toBe(true);
});
it("can request PCM16 24 kHz realtime audio for Chrome command-pair bridges", async () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const bridge = provider.createBridge({
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
onAudio: vi.fn(),
onClearAudio: vi.fn(),
});
const connecting = bridge.connect();
const socket = FakeWebSocket.instances[0];
if (!socket) {
throw new Error("expected bridge to create a websocket");
}
socket.readyState = FakeWebSocket.OPEN;
socket.emit("open");
await connecting;
expect(parseSent(socket)[0]?.session).toMatchObject({
input_audio_format: "pcm16",
output_audio_format: "pcm16",
});
});
it("settles cleanly when closed before the websocket opens", async () => {
const provider = buildOpenAIRealtimeVoiceProvider();
const onClose = vi.fn();

View File

@@ -6,6 +6,7 @@ import {
resolveDebugProxySettings,
} from "openclaw/plugin-sdk/proxy-capture";
import type {
RealtimeVoiceAudioFormat,
RealtimeVoiceBridge,
RealtimeVoiceBrowserSession,
RealtimeVoiceBrowserSessionCreateRequest,
@@ -14,6 +15,7 @@ import type {
RealtimeVoiceProviderPlugin,
RealtimeVoiceTool,
} from "openclaw/plugin-sdk/realtime-voice";
import { REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ } from "openclaw/plugin-sdk/realtime-voice";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime";
import WebSocket from "ws";
@@ -141,8 +143,11 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
private toolCallBuffers = new Map<string, { name: string; callId: string; args: string }>();
private readonly flowId = randomUUID();
private sessionReadyFired = false;
private readonly audioFormat: RealtimeVoiceAudioFormat;
constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {}
constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {
this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ;
}
async connect(): Promise<void> {
this.intentionallyClosed = false;
@@ -407,8 +412,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
modalities: ["text", "audio"],
instructions: cfg.instructions,
voice: cfg.voice ?? "alloy",
input_audio_format: "g711_ulaw",
output_audio_format: "g711_ulaw",
input_audio_format: this.resolveRealtimeAudioFormat(),
output_audio_format: this.resolveRealtimeAudioFormat(),
input_audio_transcription: {
model: "whisper-1",
},
@@ -431,6 +436,10 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
this.sendEvent(sessionUpdate);
}
// Maps the bridge audio format onto the OpenAI Realtime session format id.
private resolveRealtimeAudioFormat(): "g711_ulaw" | "pcm16" {
  if (this.audioFormat.encoding === "pcm16") {
    return "pcm16";
  }
  return "g711_ulaw";
}
private handleEvent(event: RealtimeEvent): void {
switch (event.type) {
case "session.created":

View File

@@ -1,5 +1,6 @@
export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
export type {
RealtimeVoiceAudioFormat,
RealtimeVoiceBridge,
RealtimeVoiceBridgeCallbacks,
RealtimeVoiceBrowserSession,
@@ -15,6 +16,10 @@ export type {
RealtimeVoiceToolCallEvent,
RealtimeVoiceToolResultOptions,
} from "../realtime-voice/provider-types.js";
export {
REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
} from "../realtime-voice/provider-types.js";
export {
buildRealtimeVoiceAgentConsultChatMessage,
buildRealtimeVoiceAgentConsultPrompt,

View File

@@ -6,6 +6,30 @@ export type RealtimeVoiceRole = "user" | "assistant";
export type RealtimeVoiceCloseReason = "completed" | "error";
// Wire format for audio flowing between a realtime voice provider bridge and
// its transport. Discriminated on `encoding`; each variant pins its sample
// rate and channel count so provider and transport cannot disagree.
export type RealtimeVoiceAudioFormat =
| {
encoding: "g711_ulaw"; // telephony G.711 mu-law, 1 byte/sample
sampleRateHz: 8000;
channels: 1;
}
| {
encoding: "pcm16"; // linear PCM, 16-bit signed, 2 bytes/sample
sampleRateHz: 24000;
channels: 1;
};
// Legacy telephony format: 8 kHz mono G.711 mu-law.
export const REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ: RealtimeVoiceAudioFormat = {
encoding: "g711_ulaw",
sampleRateHz: 8000,
channels: 1,
};
// Default Chrome command-pair format: 24 kHz mono 16-bit linear PCM.
export const REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ: RealtimeVoiceAudioFormat = {
encoding: "pcm16",
sampleRateHz: 24000,
channels: 1,
};
export type RealtimeVoiceTool = {
type: "function";
name: string;
@@ -29,7 +53,7 @@ export type RealtimeVoiceToolResultOptions = {
};
export type RealtimeVoiceBridgeCallbacks = {
onAudio: (muLaw: Buffer) => void;
onAudio: (audio: Buffer) => void;
onClearAudio: () => void;
onMark?: (markName: string) => void;
onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void;
@@ -53,6 +77,7 @@ export type RealtimeVoiceProviderConfiguredContext = {
export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
providerConfig: RealtimeVoiceProviderConfig;
audioFormat?: RealtimeVoiceAudioFormat;
instructions?: string;
tools?: RealtimeVoiceTool[];
};

View File

@@ -1,6 +1,9 @@
import { describe, expect, it, vi } from "vitest";
import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
import type { RealtimeVoiceBridge } from "./provider-types.js";
import {
REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
type RealtimeVoiceBridge,
} from "./provider-types.js";
import { createRealtimeVoiceBridgeSession } from "./session-runtime.js";
function makeBridge(overrides: Partial<RealtimeVoiceBridge> = {}): RealtimeVoiceBridge {
@@ -54,6 +57,28 @@ describe("realtime voice bridge session runtime", () => {
expect(sendMark).toHaveBeenCalledWith("mark-1");
});
it("passes the requested audio format to the provider bridge", () => {
let request: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0] | undefined;
const provider: RealtimeVoiceProviderPlugin = {
id: "test",
label: "Test",
isConfigured: () => true,
createBridge: (nextRequest) => {
request = nextRequest;
return makeBridge();
},
};
createRealtimeVoiceBridgeSession({
provider,
providerConfig: {},
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
audioSink: { sendAudio: vi.fn() },
});
expect(request?.audioFormat).toEqual(REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ);
});
it("can acknowledge provider marks without transport mark support", () => {
let callbacks: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0] | undefined;
const bridge = makeBridge();

View File

@@ -1,6 +1,7 @@
import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
import type {
RealtimeVoiceBridge,
RealtimeVoiceAudioFormat,
RealtimeVoiceCloseReason,
RealtimeVoiceProviderConfig,
RealtimeVoiceRole,
@@ -11,7 +12,7 @@ import type {
export type RealtimeVoiceAudioSink = {
isOpen?: () => boolean;
sendAudio: (muLaw: Buffer) => void;
sendAudio: (audio: Buffer) => void;
clearAudio?: () => void;
sendMark?: (markName: string) => void;
};
@@ -33,6 +34,7 @@ export type RealtimeVoiceBridgeSession = {
export type RealtimeVoiceBridgeSessionParams = {
provider: RealtimeVoiceProviderPlugin;
providerConfig: RealtimeVoiceProviderConfig;
audioFormat?: RealtimeVoiceAudioFormat;
audioSink: RealtimeVoiceAudioSink;
instructions?: string;
initialGreetingInstructions?: string;
@@ -73,11 +75,12 @@ export function createRealtimeVoiceBridgeSession(
const canSendAudio = () => params.audioSink.isOpen?.() ?? true;
bridge = params.provider.createBridge({
providerConfig: params.providerConfig,
audioFormat: params.audioFormat,
instructions: params.instructions,
tools: params.tools,
onAudio: (muLaw) => {
onAudio: (audio) => {
if (canSendAudio()) {
params.audioSink.sendAudio(muLaw);
params.audioSink.sendAudio(audio);
}
},
onClearAudio: () => {