feat: add browser realtime talk transports

This commit is contained in:
Peter Steinberger
2026-04-27 14:21:38 +01:00
parent 5dd1e264eb
commit 93bbbe5e37
26 changed files with 2607 additions and 319 deletions

View File

@@ -20,7 +20,7 @@ type MockGoogleLiveConnectParams = {
};
};
const { connectMock, session } = vi.hoisted(() => {
const { connectMock, createTokenMock, session } = vi.hoisted(() => {
const session: MockGoogleLiveSession = {
close: vi.fn(),
sendClientContent: vi.fn(),
@@ -28,11 +28,17 @@ const { connectMock, session } = vi.hoisted(() => {
sendToolResponse: vi.fn(),
};
const connectMock = vi.fn(async (_params: MockGoogleLiveConnectParams) => session);
return { connectMock, session };
const createTokenMock = vi.fn(async (_params: unknown) => ({
name: "auth_tokens/browser-session",
}));
return { connectMock, createTokenMock, session };
});
vi.mock("./google-genai-runtime.js", () => ({
createGoogleGenAI: vi.fn(() => ({
authTokens: {
create: createTokenMock,
},
live: {
connect: connectMock,
},
@@ -50,6 +56,7 @@ function lastConnectParams(): MockGoogleLiveConnectParams {
describe("buildGoogleRealtimeVoiceProvider", () => {
beforeEach(() => {
connectMock.mockClear();
createTokenMock.mockClear();
session.close.mockClear();
session.sendClientContent.mockClear();
session.sendRealtimeInput.mockClear();
@@ -223,6 +230,88 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
expect(lastConnectParams().config).not.toHaveProperty("temperature");
});
// Verifies the browser-session path: an ephemeral auth token must be minted via
// authTokens.create with single-use, model-constrained LiveConnect settings, and
// the returned session descriptor must carry everything the browser needs to
// open the BidiGenerateContentConstrained websocket itself.
it("creates constrained browser sessions for Google Live Talk", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
// createBrowserSession is optional on the plugin interface, hence the `?.`.
const session = await provider.createBrowserSession?.({
providerConfig: {
apiKey: "gemini-key",
model: "gemini-live-2.5-flash-preview",
voice: "Puck",
temperature: 0.4,
},
instructions: "Speak briefly.",
tools: [
{
type: "function",
name: "openclaw_agent_consult",
description: "Ask OpenClaw",
parameters: {
type: "object",
properties: {
question: { type: "string" },
},
required: ["question"],
},
},
],
});
// Exactly one token is minted per browser session.
expect(createTokenMock).toHaveBeenCalledTimes(1);
// The token must be single-use (uses: 1) and pinned to the requested model,
// voice, temperature, instructions, and tools via liveConnectConstraints.
expect(createTokenMock.mock.calls[0]?.[0]).toMatchObject({
config: {
uses: 1,
liveConnectConstraints: {
model: "gemini-live-2.5-flash-preview",
config: {
responseModalities: ["AUDIO"],
temperature: 0.4,
systemInstruction: "Speak briefly.",
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: "Puck",
},
},
},
tools: [
{
functionDeclarations: [
{
name: "openclaw_agent_consult",
// NON_BLOCKING lets the model keep streaming audio while
// the tool call is in flight.
behavior: "NON_BLOCKING",
},
],
},
],
},
},
},
});
// The descriptor hands the browser the token (clientSecret), the constrained
// websocket endpoint, PCM16 audio parameters (16 kHz in / 24 kHz out), and the
// initial setup message to send after connecting.
expect(session).toMatchObject({
provider: "google",
transport: "json-pcm-websocket",
protocol: "google-live-bidi",
clientSecret: "auth_tokens/browser-session",
websocketUrl:
"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained",
audio: {
inputEncoding: "pcm16",
inputSampleRateHz: 16000,
outputEncoding: "pcm16",
outputSampleRateHz: 24000,
},
initialMessage: {
setup: {
// The setup message uses the fully-qualified "models/" resource name.
model: "models/gemini-live-2.5-flash-preview",
generationConfig: {
responseModalities: ["AUDIO"],
},
},
},
});
});
it("waits for setup completion before draining audio and firing ready", async () => {
const provider = buildGoogleRealtimeVoiceProvider();
const onReady = vi.fn();

View File

@@ -9,6 +9,7 @@ import {
TurnCoverage,
type FunctionDeclaration,
type FunctionResponse,
type LiveConnectConfig,
type LiveServerContent,
type LiveServerMessage,
type LiveServerToolCall,
@@ -19,6 +20,8 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
import type {
RealtimeVoiceAudioFormat,
RealtimeVoiceBridge,
RealtimeVoiceBrowserSession,
RealtimeVoiceBrowserSessionCreateRequest,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
@@ -40,8 +43,13 @@ const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
const GOOGLE_REALTIME_BROWSER_API_VERSION = "v1alpha";
const GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL =
"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
const MAX_PENDING_AUDIO_CHUNKS = 320;
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
const GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS = 30 * 60 * 1000;
const GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS = 60 * 1000;
type GoogleRealtimeSensitivity = "low" | "high";
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
@@ -66,8 +74,10 @@ type GoogleRealtimeVoiceProviderConfig = {
thinkingBudget?: number;
};
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
type GoogleRealtimeLiveConfig = {
apiKey: string;
instructions?: string;
tools?: RealtimeVoiceTool[];
model?: string;
voice?: string;
temperature?: number;
@@ -84,6 +94,8 @@ type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
thinkingBudget?: number;
};
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & GoogleRealtimeLiveConfig;
type GoogleLiveSession = {
sendClientContent: (params: {
turns?: Array<{ role: string; parts: Array<{ text: string }> }>;
@@ -258,7 +270,7 @@ function mapTurnCoverage(value: GoogleRealtimeTurnCoverage | undefined): TurnCov
}
}
function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined {
function buildThinkingConfig(config: GoogleRealtimeLiveConfig): ThinkingConfig | undefined {
if (config.thinkingLevel) {
return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
}
@@ -269,7 +281,7 @@ function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingC
}
function buildRealtimeInputConfig(
config: GoogleRealtimeVoiceBridgeConfig,
config: GoogleRealtimeLiveConfig,
): RealtimeInputConfig | undefined {
const startSensitivity = mapStartSensitivity(config.startSensitivity);
const endSensitivity = mapEndSensitivity(config.endSensitivity);
@@ -310,6 +322,51 @@ function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): Func
});
}
function buildGoogleLiveConnectConfig(config: GoogleRealtimeLiveConfig): LiveConnectConfig {
const functionDeclarations = buildFunctionDeclarations(config.tools);
return {
responseModalities: [Modality.AUDIO],
...(typeof config.temperature === "number" && config.temperature > 0
? { temperature: config.temperature }
: {}),
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
},
},
},
systemInstruction: config.instructions,
...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
...(buildRealtimeInputConfig(config)
? { realtimeInputConfig: buildRealtimeInputConfig(config) }
: {}),
inputAudioTranscription: {},
outputAudioTranscription: {},
...(typeof config.enableAffectiveDialog === "boolean"
? { enableAffectiveDialog: config.enableAffectiveDialog }
: {}),
...(buildThinkingConfig(config) ? { thinkingConfig: buildThinkingConfig(config) } : {}),
};
}
/**
 * Normalizes a model identifier to its fully-qualified resource name.
 * A bare id like "gemini-live-2.5-flash-preview" becomes
 * "models/gemini-live-2.5-flash-preview"; already-qualified names pass through.
 */
function toGoogleModelResource(model: string): string {
  const prefix = "models/";
  if (model.startsWith(prefix)) {
    return model;
  }
  return `${prefix}${model}`;
}
/**
 * Builds the first message a browser client sends over the constrained
 * websocket: a `setup` frame naming the model resource, requesting audio-only
 * responses, and enabling transcription for both input and output audio.
 */
function buildBrowserInitialSetup(model: string) {
  const setup = {
    model: toGoogleModelResource(model),
    generationConfig: {
      responseModalities: [Modality.AUDIO],
    },
    inputAudioTranscription: {},
    outputAudioTranscription: {},
  };
  return { setup };
}
function parsePcmSampleRate(mimeType: string | undefined): number {
const match = mimeType?.match(/(?:^|[;,\s])rate=(\d+)/i);
const parsed = match ? Number.parseInt(match[1] ?? "", 10) : Number.NaN;
@@ -366,31 +423,9 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
},
});
const functionDeclarations = buildFunctionDeclarations(this.config.tools);
this.session = (await ai.live.connect({
model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
config: {
responseModalities: [Modality.AUDIO],
...(typeof this.config.temperature === "number" && this.config.temperature > 0
? { temperature: this.config.temperature }
: {}),
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: this.config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
},
},
},
systemInstruction: this.config.instructions,
...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
...(this.realtimeInputConfig ? { realtimeInputConfig: this.realtimeInputConfig } : {}),
inputAudioTranscription: {},
outputAudioTranscription: {},
...(typeof this.config.enableAffectiveDialog === "boolean"
? { enableAffectiveDialog: this.config.enableAffectiveDialog }
: {}),
...(this.thinkingConfig ? { thinkingConfig: this.thinkingConfig } : {}),
},
config: buildGoogleLiveConnectConfig(this.config),
callbacks: {
onopen: () => {
this.connected = true;
@@ -657,14 +692,67 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
});
}
}
}
private get realtimeInputConfig(): RealtimeInputConfig | undefined {
return buildRealtimeInputConfig(this.config);
// Mints a single-use ephemeral Google Live token and returns a descriptor the
// browser uses to open the constrained BidiGenerateContent websocket directly.
async function createGoogleRealtimeBrowserSession(
req: RealtimeVoiceBrowserSessionCreateRequest,
): Promise<RealtimeVoiceBrowserSession> {
const config = normalizeProviderConfig(req.providerConfig);
// Explicit config key wins; otherwise fall back to the environment.
const apiKey = config.apiKey || resolveEnvApiKey();
if (!apiKey) {
throw new Error("Google Gemini API key missing");
}
// NOTE(review): this span is a diff render — the two lines below look like
// stray deleted lines from the removed `thinkingConfig` getter, not part of
// this function; confirm against the actual file.
private get thinkingConfig(): ThinkingConfig | undefined {
return buildThinkingConfig(this.config);
// Per-request overrides take precedence over provider config, then defaults.
const model = req.model ?? config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL;
const voice = req.voice ?? config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE;
// Token lifetime vs. the shorter window in which a NEW session may be started.
const expiresAtMs = Date.now() + GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS;
const newSessionExpiresAtMs = Date.now() + GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS;
// Ephemeral tokens require the v1alpha API surface.
const ai = createGoogleGenAI({
apiKey,
httpOptions: {
apiVersion: GOOGLE_REALTIME_BROWSER_API_VERSION,
},
});
// Single-use token, constrained to exactly this model + live config so the
// browser cannot repurpose it.
const token = await ai.authTokens.create({
config: {
uses: 1,
expireTime: new Date(expiresAtMs).toISOString(),
newSessionExpireTime: new Date(newSessionExpiresAtMs).toISOString(),
liveConnectConstraints: {
model,
config: buildGoogleLiveConnectConfig({
...config,
apiKey,
model,
voice,
instructions: req.instructions,
tools: req.tools,
}),
},
},
});
// The token resource name (auth_tokens/…) doubles as the client secret.
const clientSecret = token.name?.trim();
if (!clientSecret) {
throw new Error("Google Live browser session did not return an ephemeral token");
}
return {
provider: "google",
transport: "json-pcm-websocket",
protocol: "google-live-bidi",
clientSecret,
websocketUrl: GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL,
// PCM16 both ways: 16 kHz microphone input, 24 kHz model output.
audio: {
inputEncoding: "pcm16",
inputSampleRateHz: GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
outputEncoding: "pcm16",
outputSampleRateHz: 24_000,
},
initialMessage: buildBrowserInitialSetup(model),
model,
voice,
// expiresAt is reported in epoch seconds, not milliseconds.
expiresAt: Math.floor(expiresAtMs / 1000),
};
}
export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
@@ -700,6 +788,7 @@ export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin
thinkingBudget: config.thinkingBudget,
});
},
createBrowserSession: createGoogleRealtimeBrowserSession,
};
}
@@ -707,5 +796,7 @@ export {
GOOGLE_REALTIME_DEFAULT_API_VERSION,
GOOGLE_REALTIME_DEFAULT_MODEL,
GOOGLE_REALTIME_DEFAULT_VOICE,
GOOGLE_REALTIME_BROWSER_API_VERSION,
GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL,
};
export type { GoogleRealtimeVoiceProviderConfig };