mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 08:50:43 +00:00
fix(google-meet): use PCM audio for Chrome realtime
This commit is contained in:
@@ -126,6 +126,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Web search: route plugin-scoped web_search SecretRefs through the active runtime config snapshot so provider execution receives resolved credentials across app/runtime paths, including `plugins.entries.brave.config.webSearch.apiKey`. Fixes #68690. Thanks @VACInc.
|
||||
- Voice Call: allow SecretRef-backed Twilio auth tokens and call-specific OpenAI/ElevenLabs TTS API keys through the plugin config surface. Fixes #68690. Thanks @joshavant.
|
||||
- Google Meet: clean stale chrome-node realtime audio bridges by URL before rejoining, expose active node bridge inspection, and tolerate transient node input pull failures instead of dropping the Meet session. Fixes #72371. (#72372) Thanks @BsnizND.
|
||||
- Google Meet: use 24 kHz PCM16 for Chrome command-pair realtime audio by default, preserve legacy 8 kHz G.711 mu-law custom command pairs, and let realtime providers negotiate the selected bridge audio format. Fixes #72525. Thanks @BsnizND.
|
||||
- Google Meet: clear queued Gemini Live playback when realtime interruptions arrive, restart Chrome command-pair audio output after clears, and expose Google Live interruption/VAD config knobs for Meet and Voice Call realtime bridges. Fixes #72523. (#72524) Thanks @BsnizND.
|
||||
- Google Meet: add `realtime.agentId` so live meeting consults can target a named OpenClaw agent instead of always using `main`. (#72381) Thanks @BsnizND.
|
||||
- Google Meet: route stateful `google_meet` tool actions through the gateway-owned runtime so created or joined realtime sessions remain visible to status, speak, and leave after the agent turn ends. Fixes #72440. (#72441) Thanks @BsnizND.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
5027142b42acd038bb3cd15e53a0d45293103448a3aee1072500352095e14242 config-baseline.json
|
||||
33425d446eda183d3574ee754bb44e7e546ea33afa855fc979f94b1e102bf047 config-baseline.json
|
||||
ecb702eee54bcb697916944440e13208ac7a640a8e07f44072bb79e9284ca994 config-baseline.core.json
|
||||
07963db49502132f26db396c56b36e018b110e6c55a68b3cb012d3ec96f43901 config-baseline.channel.json
|
||||
ed65cefbef96f034ce2b73069d9d5bacc341a43489ff9b20a34d40956b877f79 config-baseline.plugin.json
|
||||
13d038300d90d4dd064aa2ac79def867799d1be403cf9d3e81dfad35ef459a21 config-baseline.plugin.json
|
||||
|
||||
@@ -336,7 +336,7 @@ Common failure checks:
|
||||
The Chrome realtime default uses two external tools:
|
||||
|
||||
- `sox`: command-line audio utility. The plugin uses its `rec` and `play`
|
||||
commands for the default 8 kHz G.711 mu-law audio bridge.
|
||||
commands for the default 24 kHz PCM16 audio bridge.
|
||||
- `blackhole-2ch`: macOS virtual audio driver. It creates the `BlackHole 2ch`
|
||||
audio device that Chrome/Meet can route through.
|
||||
|
||||
@@ -887,10 +887,13 @@ Defaults:
|
||||
opening duplicates
|
||||
- `chrome.waitForInCallMs: 20000`: wait for the Meet tab to report in-call
|
||||
before the realtime intro is triggered
|
||||
- `chrome.audioInputCommand`: SoX `rec` command writing 8 kHz G.711 mu-law
|
||||
audio to stdout
|
||||
- `chrome.audioOutputCommand`: SoX `play` command reading 8 kHz G.711 mu-law
|
||||
audio from stdin
|
||||
- `chrome.audioFormat: "pcm16-24khz"`: command-pair audio format. Use
|
||||
`"g711-ulaw-8khz"` only for legacy/custom command pairs that still emit
|
||||
telephony audio.
|
||||
- `chrome.audioInputCommand`: SoX `rec` command writing audio in
|
||||
`chrome.audioFormat`
|
||||
- `chrome.audioOutputCommand`: SoX `play` command reading audio in
|
||||
`chrome.audioFormat`
|
||||
- `realtime.provider: "openai"`
|
||||
- `realtime.toolPolicy: "safe-read-only"`
|
||||
- `realtime.instructions`: brief spoken replies, with
|
||||
@@ -1313,8 +1316,9 @@ phone dial-in participation.
|
||||
Chrome realtime mode needs either:
|
||||
|
||||
- `chrome.audioInputCommand` plus `chrome.audioOutputCommand`: OpenClaw owns the
|
||||
realtime model bridge and pipes 8 kHz G.711 mu-law audio between those
|
||||
commands and the selected realtime voice provider.
|
||||
realtime model bridge and pipes audio in `chrome.audioFormat` between those
|
||||
commands and the selected realtime voice provider. The default Chrome path is
|
||||
24 kHz PCM16; 8 kHz G.711 mu-law remains available for legacy command pairs.
|
||||
- `chrome.audioBridgeCommand`: an external bridge command owns the whole local
|
||||
audio path and must exit after starting or validating its daemon.
|
||||
|
||||
|
||||
@@ -257,19 +257,21 @@ describe("google-meet plugin", () => {
|
||||
reuseExistingTab: true,
|
||||
autoJoin: true,
|
||||
waitForInCallMs: 20000,
|
||||
audioFormat: "pcm16-24khz",
|
||||
audioInputCommand: [
|
||||
"rec",
|
||||
"-q",
|
||||
"-t",
|
||||
"raw",
|
||||
"-r",
|
||||
"8000",
|
||||
"24000",
|
||||
"-c",
|
||||
"1",
|
||||
"-e",
|
||||
"mu-law",
|
||||
"signed-integer",
|
||||
"-b",
|
||||
"8",
|
||||
"16",
|
||||
"-L",
|
||||
"-",
|
||||
],
|
||||
audioOutputCommand: [
|
||||
@@ -278,13 +280,14 @@ describe("google-meet plugin", () => {
|
||||
"-t",
|
||||
"raw",
|
||||
"-r",
|
||||
"8000",
|
||||
"24000",
|
||||
"-c",
|
||||
"1",
|
||||
"-e",
|
||||
"mu-law",
|
||||
"signed-integer",
|
||||
"-b",
|
||||
"8",
|
||||
"16",
|
||||
"-L",
|
||||
"-",
|
||||
],
|
||||
},
|
||||
@@ -310,6 +313,21 @@ describe("google-meet plugin", () => {
|
||||
).toBe("jay");
|
||||
});
|
||||
|
||||
it("keeps legacy command-pair audio format when custom commands omit a format", () => {
|
||||
expect(
|
||||
resolveGoogleMeetConfig({
|
||||
chrome: {
|
||||
audioInputCommand: ["capture-legacy"],
|
||||
audioOutputCommand: ["play-legacy"],
|
||||
},
|
||||
}).chrome,
|
||||
).toMatchObject({
|
||||
audioFormat: "g711-ulaw-8khz",
|
||||
audioInputCommand: ["capture-legacy"],
|
||||
audioOutputCommand: ["play-legacy"],
|
||||
});
|
||||
});
|
||||
|
||||
it("uses env fallbacks for OAuth, preview, and default meeting values", () => {
|
||||
expect(
|
||||
resolveGoogleMeetConfigWithEnv(
|
||||
@@ -2085,6 +2103,11 @@ describe("google-meet plugin", () => {
|
||||
clearCount: 1,
|
||||
});
|
||||
expect(callbacks).toMatchObject({
|
||||
audioFormat: {
|
||||
encoding: "pcm16",
|
||||
sampleRateHz: 24000,
|
||||
channels: 1,
|
||||
},
|
||||
tools: [
|
||||
expect.objectContaining({
|
||||
name: "openclaw_agent_consult",
|
||||
@@ -2263,6 +2286,11 @@ describe("google-meet plugin", () => {
|
||||
handle.speak("Say exactly: hello from the node.");
|
||||
expect(bridge.triggerGreeting).toHaveBeenLastCalledWith("Say exactly: hello from the node.");
|
||||
expect(callbacks).toMatchObject({
|
||||
audioFormat: {
|
||||
encoding: "pcm16",
|
||||
sampleRateHz: 24000,
|
||||
channels: 1,
|
||||
},
|
||||
tools: [
|
||||
expect.objectContaining({
|
||||
name: "openclaw_agent_consult",
|
||||
|
||||
@@ -76,14 +76,19 @@ const googleMeetConfigSchema = {
|
||||
help: "Waits for Chrome to report that the Meet tab is in-call before the realtime intro speaks.",
|
||||
advanced: true,
|
||||
},
|
||||
"chrome.audioFormat": {
|
||||
label: "Audio Format",
|
||||
help: "Command-pair audio format. PCM16 24 kHz is the default Chrome/Meet path; G.711 mu-law 8 kHz remains available for legacy command pairs.",
|
||||
advanced: true,
|
||||
},
|
||||
"chrome.audioInputCommand": {
|
||||
label: "Audio Input Command",
|
||||
help: "Command that writes 8 kHz G.711 mu-law meeting audio to stdout.",
|
||||
help: "Command that writes meeting audio to stdout in chrome.audioFormat.",
|
||||
advanced: true,
|
||||
},
|
||||
"chrome.audioOutputCommand": {
|
||||
label: "Audio Output Command",
|
||||
help: "Command that reads 8 kHz G.711 mu-law assistant audio from stdin.",
|
||||
help: "Command that reads assistant audio from stdin in chrome.audioFormat.",
|
||||
advanced: true,
|
||||
},
|
||||
"chrome.audioBridgeCommand": { label: "Audio Bridge Command", advanced: true },
|
||||
|
||||
@@ -56,12 +56,17 @@
|
||||
},
|
||||
"chrome.audioInputCommand": {
|
||||
"label": "Audio Input Command",
|
||||
"help": "Command that writes 8 kHz G.711 mu-law meeting audio to stdout.",
|
||||
"help": "Command that writes meeting audio to stdout in chrome.audioFormat.",
|
||||
"advanced": true
|
||||
},
|
||||
"chrome.audioOutputCommand": {
|
||||
"label": "Audio Output Command",
|
||||
"help": "Command that reads 8 kHz G.711 mu-law assistant audio from stdin.",
|
||||
"help": "Command that reads assistant audio from stdin in chrome.audioFormat.",
|
||||
"advanced": true
|
||||
},
|
||||
"chrome.audioFormat": {
|
||||
"label": "Audio Format",
|
||||
"help": "Command-pair audio format. PCM16 24 kHz is the default Chrome/Meet path; G.711 mu-law 8 kHz remains available for legacy command pairs.",
|
||||
"advanced": true
|
||||
},
|
||||
"chrome.audioBridgeCommand": {
|
||||
@@ -232,6 +237,11 @@
|
||||
"type": "number",
|
||||
"default": 20000
|
||||
},
|
||||
"audioFormat": {
|
||||
"type": "string",
|
||||
"enum": ["pcm16-24khz", "g711-ulaw-8khz"],
|
||||
"default": "pcm16-24khz"
|
||||
},
|
||||
"audioInputCommand": {
|
||||
"type": "array",
|
||||
"default": [
|
||||
@@ -240,13 +250,14 @@
|
||||
"-t",
|
||||
"raw",
|
||||
"-r",
|
||||
"8000",
|
||||
"24000",
|
||||
"-c",
|
||||
"1",
|
||||
"-e",
|
||||
"mu-law",
|
||||
"signed-integer",
|
||||
"-b",
|
||||
"8",
|
||||
"16",
|
||||
"-L",
|
||||
"-"
|
||||
],
|
||||
"items": {
|
||||
@@ -261,13 +272,14 @@
|
||||
"-t",
|
||||
"raw",
|
||||
"-r",
|
||||
"8000",
|
||||
"24000",
|
||||
"-c",
|
||||
"1",
|
||||
"-e",
|
||||
"mu-law",
|
||||
"signed-integer",
|
||||
"-b",
|
||||
"8",
|
||||
"16",
|
||||
"-L",
|
||||
"-"
|
||||
],
|
||||
"items": {
|
||||
|
||||
@@ -218,7 +218,7 @@ describe("google-meet CLI", () => {
|
||||
{
|
||||
id: "audio-bridge",
|
||||
ok: true,
|
||||
message: "Chrome command-pair realtime audio bridge configured",
|
||||
message: "Chrome command-pair realtime audio bridge configured (pcm16-24khz)",
|
||||
},
|
||||
],
|
||||
}),
|
||||
@@ -226,7 +226,7 @@ describe("google-meet CLI", () => {
|
||||
}).parseAsync(["googlemeet", "setup"], { from: "user" });
|
||||
expect(stdout.output()).toContain("Google Meet setup: OK");
|
||||
expect(stdout.output()).toContain(
|
||||
"[ok] audio-bridge: Chrome command-pair realtime audio bridge configured",
|
||||
"[ok] audio-bridge: Chrome command-pair realtime audio bridge configured (pcm16-24khz)",
|
||||
);
|
||||
expect(stdout.output()).not.toContain('"checks"');
|
||||
} finally {
|
||||
|
||||
@@ -10,6 +10,7 @@ import {
|
||||
|
||||
export type GoogleMeetTransport = "chrome" | "chrome-node" | "twilio";
|
||||
export type GoogleMeetMode = "realtime" | "transcribe";
|
||||
export type GoogleMeetChromeAudioFormat = "pcm16-24khz" | "g711-ulaw-8khz";
|
||||
export type GoogleMeetToolPolicy = RealtimeVoiceAgentConsultToolPolicy;
|
||||
|
||||
export type GoogleMeetConfig = {
|
||||
@@ -24,6 +25,7 @@ export type GoogleMeetConfig = {
|
||||
defaultMode: GoogleMeetMode;
|
||||
chrome: {
|
||||
audioBackend: "blackhole-2ch";
|
||||
audioFormat: GoogleMeetChromeAudioFormat;
|
||||
launch: boolean;
|
||||
browserProfile?: string;
|
||||
guestName: string;
|
||||
@@ -77,6 +79,40 @@ export type GoogleMeetConfig = {
|
||||
};
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [
|
||||
"rec",
|
||||
"-q",
|
||||
"-t",
|
||||
"raw",
|
||||
"-r",
|
||||
"24000",
|
||||
"-c",
|
||||
"1",
|
||||
"-e",
|
||||
"signed-integer",
|
||||
"-b",
|
||||
"16",
|
||||
"-L",
|
||||
"-",
|
||||
] as const;
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
|
||||
"play",
|
||||
"-q",
|
||||
"-t",
|
||||
"raw",
|
||||
"-r",
|
||||
"24000",
|
||||
"-c",
|
||||
"1",
|
||||
"-e",
|
||||
"signed-integer",
|
||||
"-b",
|
||||
"16",
|
||||
"-L",
|
||||
"-",
|
||||
] as const;
|
||||
|
||||
export const LEGACY_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [
|
||||
"rec",
|
||||
"-q",
|
||||
"-t",
|
||||
@@ -92,7 +128,7 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND = [
|
||||
"-",
|
||||
] as const;
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
|
||||
export const LEGACY_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
|
||||
"play",
|
||||
"-q",
|
||||
"-t",
|
||||
@@ -108,6 +144,8 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
|
||||
"-",
|
||||
] as const;
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_CHROME_AUDIO_FORMAT: GoogleMeetChromeAudioFormat = "pcm16-24khz";
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`;
|
||||
export const DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE = "Say exactly: I'm here and listening.";
|
||||
|
||||
@@ -121,6 +159,7 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
|
||||
defaultMode: "realtime",
|
||||
chrome: {
|
||||
audioBackend: "blackhole-2ch",
|
||||
audioFormat: DEFAULT_GOOGLE_MEET_CHROME_AUDIO_FORMAT,
|
||||
launch: true,
|
||||
guestName: "OpenClaw Agent",
|
||||
reuseExistingTab: true,
|
||||
@@ -264,6 +303,37 @@ function resolveMode(value: unknown, fallback: GoogleMeetMode): GoogleMeetMode {
|
||||
return normalized === "realtime" || normalized === "transcribe" ? normalized : fallback;
|
||||
}
|
||||
|
||||
function resolveChromeAudioFormat(value: unknown): GoogleMeetChromeAudioFormat | undefined {
|
||||
const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-");
|
||||
switch (normalized) {
|
||||
case "pcm16-24khz":
|
||||
case "pcm16-24k":
|
||||
case "pcm24":
|
||||
case "pcm":
|
||||
return "pcm16-24khz";
|
||||
case "g711-ulaw-8khz":
|
||||
case "g711-ulaw-8k":
|
||||
case "g711-ulaw":
|
||||
case "mulaw":
|
||||
case "mu-law":
|
||||
return "g711-ulaw-8khz";
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
function defaultAudioInputCommand(format: GoogleMeetChromeAudioFormat): readonly string[] {
|
||||
return format === "g711-ulaw-8khz"
|
||||
? LEGACY_GOOGLE_MEET_AUDIO_INPUT_COMMAND
|
||||
: DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND;
|
||||
}
|
||||
|
||||
function defaultAudioOutputCommand(format: GoogleMeetChromeAudioFormat): readonly string[] {
|
||||
return format === "g711-ulaw-8khz"
|
||||
? LEGACY_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND
|
||||
: DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND;
|
||||
}
|
||||
|
||||
export function resolveGoogleMeetConfig(input: unknown): GoogleMeetConfig {
|
||||
return resolveGoogleMeetConfigWithEnv(input);
|
||||
}
|
||||
@@ -276,6 +346,13 @@ export function resolveGoogleMeetConfigWithEnv(
|
||||
const defaults = asRecord(raw.defaults);
|
||||
const preview = asRecord(raw.preview);
|
||||
const chrome = asRecord(raw.chrome);
|
||||
const configuredAudioInputCommand = resolveStringArray(chrome.audioInputCommand);
|
||||
const configuredAudioOutputCommand = resolveStringArray(chrome.audioOutputCommand);
|
||||
const hasCustomAudioCommand =
|
||||
configuredAudioInputCommand !== undefined || configuredAudioOutputCommand !== undefined;
|
||||
const audioFormat =
|
||||
resolveChromeAudioFormat(chrome.audioFormat) ??
|
||||
(hasCustomAudioCommand ? "g711-ulaw-8khz" : DEFAULT_GOOGLE_MEET_CONFIG.chrome.audioFormat);
|
||||
const chromeNode = asRecord(raw.chromeNode);
|
||||
const twilio = asRecord(raw.twilio);
|
||||
const voiceCall = asRecord(raw.voiceCall);
|
||||
@@ -304,6 +381,7 @@ export function resolveGoogleMeetConfigWithEnv(
|
||||
defaultMode: resolveMode(raw.defaultMode, DEFAULT_GOOGLE_MEET_CONFIG.defaultMode),
|
||||
chrome: {
|
||||
audioBackend: "blackhole-2ch",
|
||||
audioFormat,
|
||||
launch: resolveBoolean(chrome.launch, DEFAULT_GOOGLE_MEET_CONFIG.chrome.launch),
|
||||
browserProfile: normalizeOptionalString(chrome.browserProfile),
|
||||
guestName:
|
||||
@@ -321,11 +399,9 @@ export function resolveGoogleMeetConfigWithEnv(
|
||||
chrome.waitForInCallMs,
|
||||
DEFAULT_GOOGLE_MEET_CONFIG.chrome.waitForInCallMs,
|
||||
),
|
||||
audioInputCommand: resolveStringArray(chrome.audioInputCommand) ?? [
|
||||
...DEFAULT_GOOGLE_MEET_AUDIO_INPUT_COMMAND,
|
||||
],
|
||||
audioOutputCommand: resolveStringArray(chrome.audioOutputCommand) ?? [
|
||||
...DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND,
|
||||
audioInputCommand: configuredAudioInputCommand ?? [...defaultAudioInputCommand(audioFormat)],
|
||||
audioOutputCommand: configuredAudioOutputCommand ?? [
|
||||
...defaultAudioOutputCommand(audioFormat),
|
||||
],
|
||||
audioBridgeCommand: resolveStringArray(chrome.audioBridgeCommand),
|
||||
audioBridgeHealthCommand: resolveStringArray(chrome.audioBridgeHealthCommand),
|
||||
|
||||
@@ -13,7 +13,10 @@ import {
|
||||
submitGoogleMeetConsultWorkingResponse,
|
||||
} from "./agent-consult.js";
|
||||
import type { GoogleMeetConfig } from "./config.js";
|
||||
import { resolveGoogleMeetRealtimeProvider } from "./realtime.js";
|
||||
import {
|
||||
resolveGoogleMeetRealtimeAudioFormat,
|
||||
resolveGoogleMeetRealtimeProvider,
|
||||
} from "./realtime.js";
|
||||
import type { GoogleMeetChromeHealth } from "./transports/types.js";
|
||||
|
||||
export type ChromeNodeRealtimeAudioBridgeHandle = {
|
||||
@@ -93,6 +96,7 @@ export async function startNodeRealtimeAudioBridge(params: {
|
||||
bridge = createRealtimeVoiceBridgeSession({
|
||||
provider: resolved.provider,
|
||||
providerConfig: resolved.providerConfig,
|
||||
audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config),
|
||||
instructions: params.config.realtime.instructions,
|
||||
initialGreetingInstructions: params.config.realtime.introMessage,
|
||||
triggerGreetingOnReady: false,
|
||||
@@ -100,9 +104,9 @@ export async function startNodeRealtimeAudioBridge(params: {
|
||||
tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy),
|
||||
audioSink: {
|
||||
isOpen: () => !stopped,
|
||||
sendAudio: (muLaw) => {
|
||||
sendAudio: (audio) => {
|
||||
lastOutputAt = new Date().toISOString();
|
||||
lastOutputBytes += muLaw.byteLength;
|
||||
lastOutputBytes += audio.byteLength;
|
||||
void params.runtime.nodes
|
||||
.invoke({
|
||||
nodeId: params.nodeId,
|
||||
@@ -110,7 +114,7 @@ export async function startNodeRealtimeAudioBridge(params: {
|
||||
params: {
|
||||
action: "pushAudio",
|
||||
bridgeId: params.bridgeId,
|
||||
base64: Buffer.from(muLaw).toString("base64"),
|
||||
base64: Buffer.from(audio).toString("base64"),
|
||||
},
|
||||
timeoutMs: 5_000,
|
||||
})
|
||||
|
||||
@@ -5,6 +5,8 @@ import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
|
||||
import type { PluginRuntime, RuntimeLogger } from "openclaw/plugin-sdk/plugin-runtime";
|
||||
import {
|
||||
createRealtimeVoiceBridgeSession,
|
||||
REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
|
||||
REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
|
||||
resolveConfiguredRealtimeVoiceProvider,
|
||||
type RealtimeVoiceBridgeSession,
|
||||
type RealtimeVoiceProviderConfig,
|
||||
@@ -61,6 +63,12 @@ function splitCommand(argv: string[]): { command: string; args: string[] } {
|
||||
return { command, args };
|
||||
}
|
||||
|
||||
export function resolveGoogleMeetRealtimeAudioFormat(config: GoogleMeetConfig) {
|
||||
return config.chrome.audioFormat === "g711-ulaw-8khz"
|
||||
? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ
|
||||
: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ;
|
||||
}
|
||||
|
||||
export function resolveGoogleMeetRealtimeProvider(params: {
|
||||
config: GoogleMeetConfig;
|
||||
fullConfig: OpenClawConfig;
|
||||
@@ -187,6 +195,7 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
bridge = createRealtimeVoiceBridgeSession({
|
||||
provider: resolved.provider,
|
||||
providerConfig: resolved.providerConfig,
|
||||
audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config),
|
||||
instructions: params.config.realtime.instructions,
|
||||
initialGreetingInstructions: params.config.realtime.introMessage,
|
||||
triggerGreetingOnReady: false,
|
||||
@@ -194,10 +203,10 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy),
|
||||
audioSink: {
|
||||
isOpen: () => !stopped,
|
||||
sendAudio: (muLaw) => {
|
||||
sendAudio: (audio) => {
|
||||
lastOutputAt = new Date().toISOString();
|
||||
lastOutputBytes += muLaw.byteLength;
|
||||
outputProcess.stdin?.write(muLaw);
|
||||
lastOutputBytes += audio.byteLength;
|
||||
outputProcess.stdin?.write(audio);
|
||||
},
|
||||
clearAudio: clearOutputPlayback,
|
||||
},
|
||||
|
||||
@@ -104,7 +104,7 @@ export function getGoogleMeetSetupStatus(
|
||||
message: config.chrome.audioBridgeCommand
|
||||
? "Chrome audio bridge command configured"
|
||||
: config.chrome.audioInputCommand && config.chrome.audioOutputCommand
|
||||
? "Chrome command-pair realtime audio bridge configured"
|
||||
? `Chrome command-pair realtime audio bridge configured (${config.chrome.audioFormat})`
|
||||
: "Chrome realtime audio bridge not configured",
|
||||
});
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
|
||||
|
||||
@@ -281,6 +282,31 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
|
||||
});
|
||||
|
||||
it("accepts PCM16 24 kHz audio without the telephony mu-law hop", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const bridge = provider.createBridge({
|
||||
providerConfig: { apiKey: "gemini-key" },
|
||||
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
|
||||
onAudio: vi.fn(),
|
||||
onClearAudio: vi.fn(),
|
||||
});
|
||||
|
||||
await bridge.connect();
|
||||
lastConnectParams().callbacks.onopen();
|
||||
lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
|
||||
|
||||
bridge.sendAudio(Buffer.alloc(480));
|
||||
|
||||
expect(session.sendRealtimeInput).toHaveBeenCalledWith({
|
||||
audio: {
|
||||
data: expect.any(String),
|
||||
mimeType: "audio/pcm;rate=16000",
|
||||
},
|
||||
});
|
||||
const sent = Buffer.from(session.sendRealtimeInput.mock.calls[0]?.[0].audio.data, "base64");
|
||||
expect(sent).toHaveLength(320);
|
||||
});
|
||||
|
||||
it("can disable automatic VAD for manual activity signaling experiments", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const bridge = provider.createBridge({
|
||||
@@ -355,6 +381,38 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80);
|
||||
});
|
||||
|
||||
it("can keep Google PCM output as PCM16 24 kHz audio", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const onAudio = vi.fn();
|
||||
const bridge = provider.createBridge({
|
||||
providerConfig: { apiKey: "gemini-key" },
|
||||
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
|
||||
onAudio,
|
||||
onClearAudio: vi.fn(),
|
||||
});
|
||||
const pcm24k = Buffer.alloc(480);
|
||||
|
||||
await bridge.connect();
|
||||
lastConnectParams().callbacks.onmessage({
|
||||
setupComplete: { sessionId: "session-1" },
|
||||
serverContent: {
|
||||
modelTurn: {
|
||||
parts: [
|
||||
{
|
||||
inlineData: {
|
||||
mimeType: "audio/L16;codec=pcm;rate=24000",
|
||||
data: pcm24k.toString("base64"),
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(onAudio).toHaveBeenCalledTimes(1);
|
||||
expect(onAudio.mock.calls[0]?.[0]).toEqual(pcm24k);
|
||||
});
|
||||
|
||||
it("does not forward Google thought text as assistant transcript", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const onTranscript = vi.fn();
|
||||
|
||||
@@ -17,6 +17,7 @@ import {
|
||||
} from "@google/genai";
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
|
||||
import type {
|
||||
RealtimeVoiceAudioFormat,
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceProviderConfig,
|
||||
@@ -27,6 +28,7 @@ import type {
|
||||
import {
|
||||
convertPcmToMulaw8k,
|
||||
mulawToPcm,
|
||||
REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
|
||||
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
|
||||
resamplePcm,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
@@ -38,7 +40,6 @@ const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-
|
||||
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
|
||||
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
|
||||
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
|
||||
const TELEPHONY_SAMPLE_RATE = 8000;
|
||||
const MAX_PENDING_AUDIO_CHUNKS = 320;
|
||||
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
|
||||
|
||||
@@ -319,6 +320,19 @@ function isMulawSilence(audio: Buffer): boolean {
|
||||
return audio.length > 0 && audio.every((sample) => sample === 0xff);
|
||||
}
|
||||
|
||||
function isPcm16Silence(audio: Buffer): boolean {
|
||||
const samples = Math.floor(audio.length / 2);
|
||||
if (samples === 0) {
|
||||
return false;
|
||||
}
|
||||
for (let i = 0; i < samples; i += 1) {
|
||||
if (audio.readInt16LE(i * 2) !== 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
readonly supportsToolResultContinuation = true;
|
||||
|
||||
@@ -331,8 +345,11 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
private consecutiveSilenceMs = 0;
|
||||
private audioStreamEnded = false;
|
||||
private pendingFunctionNames = new Map<string, string>();
|
||||
private readonly audioFormat: RealtimeVoiceAudioFormat;
|
||||
|
||||
constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {}
|
||||
constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {
|
||||
this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ;
|
||||
}
|
||||
|
||||
async connect(): Promise<void> {
|
||||
this.intentionallyClosed = false;
|
||||
@@ -409,7 +426,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
}
|
||||
return;
|
||||
}
|
||||
const silent = isMulawSilence(audio);
|
||||
const silent = this.isSilence(audio);
|
||||
if (silent && this.audioStreamEnded) {
|
||||
return;
|
||||
}
|
||||
@@ -418,9 +435,10 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
this.audioStreamEnded = false;
|
||||
}
|
||||
|
||||
const pcm = this.toInputPcm(audio);
|
||||
const pcm16k = resamplePcm(
|
||||
mulawToPcm(audio),
|
||||
TELEPHONY_SAMPLE_RATE,
|
||||
pcm,
|
||||
this.audioFormat.sampleRateHz,
|
||||
GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
|
||||
);
|
||||
this.session.sendRealtimeInput({
|
||||
@@ -438,7 +456,10 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
typeof this.config.silenceDurationMs === "number"
|
||||
? Math.max(0, Math.floor(this.config.silenceDurationMs))
|
||||
: DEFAULT_AUDIO_STREAM_END_SILENCE_MS;
|
||||
this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000);
|
||||
const bytesPerSample = this.audioFormat.encoding === "pcm16" ? 2 : 1;
|
||||
this.consecutiveSilenceMs += Math.round(
|
||||
(audio.length / bytesPerSample / this.audioFormat.sampleRateHz) * 1000,
|
||||
);
|
||||
if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) {
|
||||
this.session.sendRealtimeInput({ audioStreamEnd: true });
|
||||
this.audioStreamEnded = true;
|
||||
@@ -536,6 +557,20 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
return this.connected && this.sessionConfigured;
|
||||
}
|
||||
|
||||
private isSilence(audio: Buffer): boolean {
|
||||
return this.audioFormat.encoding === "pcm16" ? isPcm16Silence(audio) : isMulawSilence(audio);
|
||||
}
|
||||
|
||||
private toInputPcm(audio: Buffer): Buffer {
|
||||
return this.audioFormat.encoding === "pcm16" ? audio : mulawToPcm(audio);
|
||||
}
|
||||
|
||||
private toOutputAudio(pcm: Buffer, sampleRate: number): Buffer {
|
||||
return this.audioFormat.encoding === "pcm16"
|
||||
? resamplePcm(pcm, sampleRate, this.audioFormat.sampleRateHz)
|
||||
: convertPcmToMulaw8k(pcm, sampleRate);
|
||||
}
|
||||
|
||||
private handleMessage(message: LiveServerMessage): void {
|
||||
if (message.setupComplete) {
|
||||
this.handleSetupComplete();
|
||||
@@ -585,9 +620,9 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
if (part.inlineData?.data) {
|
||||
const pcm = Buffer.from(part.inlineData.data, "base64");
|
||||
const sampleRate = parsePcmSampleRate(part.inlineData.mimeType);
|
||||
const muLaw = convertPcmToMulaw8k(pcm, sampleRate);
|
||||
if (muLaw.length > 0) {
|
||||
this.config.onAudio(muLaw);
|
||||
const audio = this.toOutputAudio(pcm, sampleRate);
|
||||
if (audio.length > 0) {
|
||||
this.config.onAudio(audio);
|
||||
this.config.onMark?.(`audio-${randomUUID()}`);
|
||||
}
|
||||
continue;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
|
||||
|
||||
@@ -56,7 +57,14 @@ vi.mock("ws", () => ({
|
||||
}));
|
||||
|
||||
type FakeWebSocketInstance = InstanceType<typeof FakeWebSocket>;
|
||||
type SentRealtimeEvent = { type: string; audio?: string };
|
||||
type SentRealtimeEvent = {
|
||||
type: string;
|
||||
audio?: string;
|
||||
session?: {
|
||||
input_audio_format?: string;
|
||||
output_audio_format?: string;
|
||||
};
|
||||
};
|
||||
|
||||
function parseSent(socket: FakeWebSocketInstance): SentRealtimeEvent[] {
|
||||
return socket.sent.map((payload: string) => JSON.parse(payload) as SentRealtimeEvent);
|
||||
@@ -118,6 +126,10 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
|
||||
|
||||
expect(onReady).not.toHaveBeenCalled();
|
||||
expect(parseSent(socket).map((event) => event.type)).toEqual(["session.update"]);
|
||||
expect(parseSent(socket)[0]?.session).toMatchObject({
|
||||
input_audio_format: "g711_ulaw",
|
||||
output_audio_format: "g711_ulaw",
|
||||
});
|
||||
expect(bridge.isConnected()).toBe(false);
|
||||
|
||||
socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
|
||||
@@ -130,6 +142,31 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
|
||||
expect(bridge.isConnected()).toBe(true);
|
||||
});
|
||||
|
||||
it("can request PCM16 24 kHz realtime audio for Chrome command-pair bridges", async () => {
|
||||
const provider = buildOpenAIRealtimeVoiceProvider();
|
||||
const bridge = provider.createBridge({
|
||||
providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
|
||||
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
|
||||
onAudio: vi.fn(),
|
||||
onClearAudio: vi.fn(),
|
||||
});
|
||||
|
||||
const connecting = bridge.connect();
|
||||
const socket = FakeWebSocket.instances[0];
|
||||
if (!socket) {
|
||||
throw new Error("expected bridge to create a websocket");
|
||||
}
|
||||
|
||||
socket.readyState = FakeWebSocket.OPEN;
|
||||
socket.emit("open");
|
||||
await connecting;
|
||||
|
||||
expect(parseSent(socket)[0]?.session).toMatchObject({
|
||||
input_audio_format: "pcm16",
|
||||
output_audio_format: "pcm16",
|
||||
});
|
||||
});
|
||||
|
||||
it("settles cleanly when closed before the websocket opens", async () => {
|
||||
const provider = buildOpenAIRealtimeVoiceProvider();
|
||||
const onClose = vi.fn();
|
||||
|
||||
@@ -6,6 +6,7 @@ import {
|
||||
resolveDebugProxySettings,
|
||||
} from "openclaw/plugin-sdk/proxy-capture";
|
||||
import type {
|
||||
RealtimeVoiceAudioFormat,
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBrowserSession,
|
||||
RealtimeVoiceBrowserSessionCreateRequest,
|
||||
@@ -14,6 +15,7 @@ import type {
|
||||
RealtimeVoiceProviderPlugin,
|
||||
RealtimeVoiceTool,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ } from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime";
|
||||
import WebSocket from "ws";
|
||||
@@ -141,8 +143,11 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
private toolCallBuffers = new Map<string, { name: string; callId: string; args: string }>();
|
||||
private readonly flowId = randomUUID();
|
||||
private sessionReadyFired = false;
|
||||
private readonly audioFormat: RealtimeVoiceAudioFormat;
|
||||
|
||||
constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {}
|
||||
constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {
|
||||
this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ;
|
||||
}
|
||||
|
||||
async connect(): Promise<void> {
|
||||
this.intentionallyClosed = false;
|
||||
@@ -407,8 +412,8 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
modalities: ["text", "audio"],
|
||||
instructions: cfg.instructions,
|
||||
voice: cfg.voice ?? "alloy",
|
||||
input_audio_format: "g711_ulaw",
|
||||
output_audio_format: "g711_ulaw",
|
||||
input_audio_format: this.resolveRealtimeAudioFormat(),
|
||||
output_audio_format: this.resolveRealtimeAudioFormat(),
|
||||
input_audio_transcription: {
|
||||
model: "whisper-1",
|
||||
},
|
||||
@@ -431,6 +436,10 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
this.sendEvent(sessionUpdate);
|
||||
}
|
||||
|
||||
private resolveRealtimeAudioFormat(): "g711_ulaw" | "pcm16" {
|
||||
return this.audioFormat.encoding === "pcm16" ? "pcm16" : "g711_ulaw";
|
||||
}
|
||||
|
||||
private handleEvent(event: RealtimeEvent): void {
|
||||
switch (event.type) {
|
||||
case "session.created":
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
|
||||
export type {
|
||||
RealtimeVoiceAudioFormat,
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBridgeCallbacks,
|
||||
RealtimeVoiceBrowserSession,
|
||||
@@ -15,6 +16,10 @@ export type {
|
||||
RealtimeVoiceToolCallEvent,
|
||||
RealtimeVoiceToolResultOptions,
|
||||
} from "../realtime-voice/provider-types.js";
|
||||
export {
|
||||
REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
|
||||
REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
|
||||
} from "../realtime-voice/provider-types.js";
|
||||
export {
|
||||
buildRealtimeVoiceAgentConsultChatMessage,
|
||||
buildRealtimeVoiceAgentConsultPrompt,
|
||||
|
||||
@@ -6,6 +6,30 @@ export type RealtimeVoiceRole = "user" | "assistant";
|
||||
|
||||
export type RealtimeVoiceCloseReason = "completed" | "error";
|
||||
|
||||
export type RealtimeVoiceAudioFormat =
|
||||
| {
|
||||
encoding: "g711_ulaw";
|
||||
sampleRateHz: 8000;
|
||||
channels: 1;
|
||||
}
|
||||
| {
|
||||
encoding: "pcm16";
|
||||
sampleRateHz: 24000;
|
||||
channels: 1;
|
||||
};
|
||||
|
||||
export const REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ: RealtimeVoiceAudioFormat = {
|
||||
encoding: "g711_ulaw",
|
||||
sampleRateHz: 8000,
|
||||
channels: 1,
|
||||
};
|
||||
|
||||
export const REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ: RealtimeVoiceAudioFormat = {
|
||||
encoding: "pcm16",
|
||||
sampleRateHz: 24000,
|
||||
channels: 1,
|
||||
};
|
||||
|
||||
export type RealtimeVoiceTool = {
|
||||
type: "function";
|
||||
name: string;
|
||||
@@ -29,7 +53,7 @@ export type RealtimeVoiceToolResultOptions = {
|
||||
};
|
||||
|
||||
export type RealtimeVoiceBridgeCallbacks = {
|
||||
onAudio: (muLaw: Buffer) => void;
|
||||
onAudio: (audio: Buffer) => void;
|
||||
onClearAudio: () => void;
|
||||
onMark?: (markName: string) => void;
|
||||
onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void;
|
||||
@@ -53,6 +77,7 @@ export type RealtimeVoiceProviderConfiguredContext = {
|
||||
|
||||
export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
|
||||
providerConfig: RealtimeVoiceProviderConfig;
|
||||
audioFormat?: RealtimeVoiceAudioFormat;
|
||||
instructions?: string;
|
||||
tools?: RealtimeVoiceTool[];
|
||||
};
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
|
||||
import type { RealtimeVoiceBridge } from "./provider-types.js";
|
||||
import {
|
||||
REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
|
||||
type RealtimeVoiceBridge,
|
||||
} from "./provider-types.js";
|
||||
import { createRealtimeVoiceBridgeSession } from "./session-runtime.js";
|
||||
|
||||
function makeBridge(overrides: Partial<RealtimeVoiceBridge> = {}): RealtimeVoiceBridge {
|
||||
@@ -54,6 +57,28 @@ describe("realtime voice bridge session runtime", () => {
|
||||
expect(sendMark).toHaveBeenCalledWith("mark-1");
|
||||
});
|
||||
|
||||
it("passes the requested audio format to the provider bridge", () => {
|
||||
let request: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0] | undefined;
|
||||
const provider: RealtimeVoiceProviderPlugin = {
|
||||
id: "test",
|
||||
label: "Test",
|
||||
isConfigured: () => true,
|
||||
createBridge: (nextRequest) => {
|
||||
request = nextRequest;
|
||||
return makeBridge();
|
||||
},
|
||||
};
|
||||
|
||||
createRealtimeVoiceBridgeSession({
|
||||
provider,
|
||||
providerConfig: {},
|
||||
audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
|
||||
audioSink: { sendAudio: vi.fn() },
|
||||
});
|
||||
|
||||
expect(request?.audioFormat).toEqual(REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ);
|
||||
});
|
||||
|
||||
it("can acknowledge provider marks without transport mark support", () => {
|
||||
let callbacks: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0] | undefined;
|
||||
const bridge = makeBridge();
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
|
||||
import type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceAudioFormat,
|
||||
RealtimeVoiceCloseReason,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceRole,
|
||||
@@ -11,7 +12,7 @@ import type {
|
||||
|
||||
export type RealtimeVoiceAudioSink = {
|
||||
isOpen?: () => boolean;
|
||||
sendAudio: (muLaw: Buffer) => void;
|
||||
sendAudio: (audio: Buffer) => void;
|
||||
clearAudio?: () => void;
|
||||
sendMark?: (markName: string) => void;
|
||||
};
|
||||
@@ -33,6 +34,7 @@ export type RealtimeVoiceBridgeSession = {
|
||||
export type RealtimeVoiceBridgeSessionParams = {
|
||||
provider: RealtimeVoiceProviderPlugin;
|
||||
providerConfig: RealtimeVoiceProviderConfig;
|
||||
audioFormat?: RealtimeVoiceAudioFormat;
|
||||
audioSink: RealtimeVoiceAudioSink;
|
||||
instructions?: string;
|
||||
initialGreetingInstructions?: string;
|
||||
@@ -73,11 +75,12 @@ export function createRealtimeVoiceBridgeSession(
|
||||
const canSendAudio = () => params.audioSink.isOpen?.() ?? true;
|
||||
bridge = params.provider.createBridge({
|
||||
providerConfig: params.providerConfig,
|
||||
audioFormat: params.audioFormat,
|
||||
instructions: params.instructions,
|
||||
tools: params.tools,
|
||||
onAudio: (muLaw) => {
|
||||
onAudio: (audio) => {
|
||||
if (canSendAudio()) {
|
||||
params.audioSink.sendAudio(muLaw);
|
||||
params.audioSink.sendAudio(audio);
|
||||
}
|
||||
},
|
||||
onClearAudio: () => {
|
||||
|
||||
Reference in New Issue
Block a user