mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 19:40:42 +00:00
feat: add browser realtime talk transports
This commit is contained in:
@@ -20,7 +20,7 @@ type MockGoogleLiveConnectParams = {
|
||||
};
|
||||
};
|
||||
|
||||
const { connectMock, session } = vi.hoisted(() => {
|
||||
const { connectMock, createTokenMock, session } = vi.hoisted(() => {
|
||||
const session: MockGoogleLiveSession = {
|
||||
close: vi.fn(),
|
||||
sendClientContent: vi.fn(),
|
||||
@@ -28,11 +28,17 @@ const { connectMock, session } = vi.hoisted(() => {
|
||||
sendToolResponse: vi.fn(),
|
||||
};
|
||||
const connectMock = vi.fn(async (_params: MockGoogleLiveConnectParams) => session);
|
||||
return { connectMock, session };
|
||||
const createTokenMock = vi.fn(async (_params: unknown) => ({
|
||||
name: "auth_tokens/browser-session",
|
||||
}));
|
||||
return { connectMock, createTokenMock, session };
|
||||
});
|
||||
|
||||
vi.mock("./google-genai-runtime.js", () => ({
|
||||
createGoogleGenAI: vi.fn(() => ({
|
||||
authTokens: {
|
||||
create: createTokenMock,
|
||||
},
|
||||
live: {
|
||||
connect: connectMock,
|
||||
},
|
||||
@@ -50,6 +56,7 @@ function lastConnectParams(): MockGoogleLiveConnectParams {
|
||||
describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
beforeEach(() => {
|
||||
connectMock.mockClear();
|
||||
createTokenMock.mockClear();
|
||||
session.close.mockClear();
|
||||
session.sendClientContent.mockClear();
|
||||
session.sendRealtimeInput.mockClear();
|
||||
@@ -223,6 +230,88 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
expect(lastConnectParams().config).not.toHaveProperty("temperature");
|
||||
});
|
||||
|
||||
it("creates constrained browser sessions for Google Live Talk", async () => {
  const provider = buildGoogleRealtimeVoiceProvider();

  // Named `browserSession` so it does not shadow the hoisted mock `session`.
  const browserSession = await provider.createBrowserSession?.({
    providerConfig: {
      apiKey: "gemini-key",
      model: "gemini-live-2.5-flash-preview",
      voice: "Puck",
      temperature: 0.4,
    },
    instructions: "Speak briefly.",
    tools: [
      {
        type: "function",
        name: "openclaw_agent_consult",
        description: "Ask OpenClaw",
        parameters: {
          type: "object",
          properties: {
            question: { type: "string" },
          },
          required: ["question"],
        },
      },
    ],
  });

  // Exactly one ephemeral token is requested, and its constraints carry the
  // model, voice, temperature, instructions and tool declarations.
  expect(createTokenMock).toHaveBeenCalledTimes(1);
  expect(createTokenMock.mock.calls[0]?.[0]).toMatchObject({
    config: {
      uses: 1,
      liveConnectConstraints: {
        model: "gemini-live-2.5-flash-preview",
        config: {
          responseModalities: ["AUDIO"],
          temperature: 0.4,
          systemInstruction: "Speak briefly.",
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: {
                voiceName: "Puck",
              },
            },
          },
          tools: [
            {
              functionDeclarations: [
                {
                  name: "openclaw_agent_consult",
                  behavior: "NON_BLOCKING",
                },
              ],
            },
          ],
        },
      },
    },
  });

  // The returned descriptor exposes the token name as the client secret plus
  // the websocket endpoint, audio formats and the initial setup message.
  expect(browserSession).toMatchObject({
    provider: "google",
    transport: "json-pcm-websocket",
    protocol: "google-live-bidi",
    clientSecret: "auth_tokens/browser-session",
    websocketUrl:
      "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained",
    audio: {
      inputEncoding: "pcm16",
      inputSampleRateHz: 16000,
      outputEncoding: "pcm16",
      outputSampleRateHz: 24000,
    },
    initialMessage: {
      setup: {
        model: "models/gemini-live-2.5-flash-preview",
        generationConfig: {
          responseModalities: ["AUDIO"],
        },
      },
    },
  });
});
|
||||
|
||||
it("waits for setup completion before draining audio and firing ready", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const onReady = vi.fn();
|
||||
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
TurnCoverage,
|
||||
type FunctionDeclaration,
|
||||
type FunctionResponse,
|
||||
type LiveConnectConfig,
|
||||
type LiveServerContent,
|
||||
type LiveServerMessage,
|
||||
type LiveServerToolCall,
|
||||
@@ -19,6 +20,8 @@ import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
|
||||
import type {
|
||||
RealtimeVoiceAudioFormat,
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBrowserSession,
|
||||
RealtimeVoiceBrowserSessionCreateRequest,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
@@ -40,8 +43,13 @@ const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-
|
||||
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
|
||||
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
|
||||
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
|
||||
const GOOGLE_REALTIME_BROWSER_API_VERSION = "v1alpha";
|
||||
const GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL =
|
||||
"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
|
||||
const MAX_PENDING_AUDIO_CHUNKS = 320;
|
||||
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
|
||||
const GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS = 30 * 60 * 1000;
|
||||
const GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS = 60 * 1000;
|
||||
|
||||
type GoogleRealtimeSensitivity = "low" | "high";
|
||||
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
|
||||
@@ -66,8 +74,10 @@ type GoogleRealtimeVoiceProviderConfig = {
|
||||
thinkingBudget?: number;
|
||||
};
|
||||
|
||||
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
|
||||
type GoogleRealtimeLiveConfig = {
|
||||
apiKey: string;
|
||||
instructions?: string;
|
||||
tools?: RealtimeVoiceTool[];
|
||||
model?: string;
|
||||
voice?: string;
|
||||
temperature?: number;
|
||||
@@ -84,6 +94,8 @@ type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
|
||||
thinkingBudget?: number;
|
||||
};
|
||||
|
||||
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & GoogleRealtimeLiveConfig;
|
||||
|
||||
type GoogleLiveSession = {
|
||||
sendClientContent: (params: {
|
||||
turns?: Array<{ role: string; parts: Array<{ text: string }> }>;
|
||||
@@ -258,7 +270,7 @@ function mapTurnCoverage(value: GoogleRealtimeTurnCoverage | undefined): TurnCov
|
||||
}
|
||||
}
|
||||
|
||||
function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined {
|
||||
function buildThinkingConfig(config: GoogleRealtimeLiveConfig): ThinkingConfig | undefined {
|
||||
if (config.thinkingLevel) {
|
||||
return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
|
||||
}
|
||||
@@ -269,7 +281,7 @@ function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingC
|
||||
}
|
||||
|
||||
function buildRealtimeInputConfig(
|
||||
config: GoogleRealtimeVoiceBridgeConfig,
|
||||
config: GoogleRealtimeLiveConfig,
|
||||
): RealtimeInputConfig | undefined {
|
||||
const startSensitivity = mapStartSensitivity(config.startSensitivity);
|
||||
const endSensitivity = mapEndSensitivity(config.endSensitivity);
|
||||
@@ -310,6 +322,51 @@ function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): Func
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Builds the Live API connect configuration from the provider settings.
 *
 * Optional sections (temperature, tools, realtime input config, affective
 * dialog, thinking config) are only added when the corresponding setting
 * produces a value, so the resulting object never carries those keys as
 * `undefined`.
 *
 * @param config - Resolved live settings (voice, instructions, tools, ...).
 * @returns The connect configuration, always requesting audio responses.
 */
function buildGoogleLiveConnectConfig(config: GoogleRealtimeLiveConfig): LiveConnectConfig {
  const functionDeclarations = buildFunctionDeclarations(config.tools);
  // Compute each derived section once instead of once to test and once to spread.
  const realtimeInputConfig = buildRealtimeInputConfig(config);
  const thinkingConfig = buildThinkingConfig(config);

  return {
    responseModalities: [Modality.AUDIO],
    // Only a positive numeric temperature is forwarded; 0 and non-numbers are omitted.
    ...(typeof config.temperature === "number" && config.temperature > 0
      ? { temperature: config.temperature }
      : {}),
    speechConfig: {
      voiceConfig: {
        prebuiltVoiceConfig: {
          voiceName: config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
        },
      },
    },
    systemInstruction: config.instructions,
    ...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
    ...(realtimeInputConfig ? { realtimeInputConfig } : {}),
    inputAudioTranscription: {},
    outputAudioTranscription: {},
    ...(typeof config.enableAffectiveDialog === "boolean"
      ? { enableAffectiveDialog: config.enableAffectiveDialog }
      : {}),
    ...(thinkingConfig ? { thinkingConfig } : {}),
  };
}
|
||||
|
||||
/**
 * Returns the `models/<id>` resource form of a model identifier.
 *
 * @param model - A model id, with or without the `models/` prefix.
 * @returns `model` unchanged when it already starts with `models/`,
 *   otherwise `models/` followed by `model`.
 */
function toGoogleModelResource(model: string): string {
  const prefix = "models/";
  return model.startsWith(prefix) ? model : `${prefix}${model}`;
}
|
||||
|
||||
/**
 * Builds the `setup` message returned to the browser as the session's
 * `initialMessage`.
 *
 * @param model - Model id; it is converted to its `models/<id>` resource form.
 * @returns An object with a single `setup` key requesting audio responses and
 *   enabling input and output audio transcription with empty option objects.
 */
function buildBrowserInitialSetup(model: string) {
  return {
    setup: {
      model: toGoogleModelResource(model),
      generationConfig: {
        responseModalities: [Modality.AUDIO],
      },
      inputAudioTranscription: {},
      outputAudioTranscription: {},
    },
  };
}
|
||||
|
||||
function parsePcmSampleRate(mimeType: string | undefined): number {
|
||||
const match = mimeType?.match(/(?:^|[;,\s])rate=(\d+)/i);
|
||||
const parsed = match ? Number.parseInt(match[1] ?? "", 10) : Number.NaN;
|
||||
@@ -366,31 +423,9 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
},
|
||||
});
|
||||
|
||||
const functionDeclarations = buildFunctionDeclarations(this.config.tools);
|
||||
this.session = (await ai.live.connect({
|
||||
model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
|
||||
config: {
|
||||
responseModalities: [Modality.AUDIO],
|
||||
...(typeof this.config.temperature === "number" && this.config.temperature > 0
|
||||
? { temperature: this.config.temperature }
|
||||
: {}),
|
||||
speechConfig: {
|
||||
voiceConfig: {
|
||||
prebuiltVoiceConfig: {
|
||||
voiceName: this.config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
|
||||
},
|
||||
},
|
||||
},
|
||||
systemInstruction: this.config.instructions,
|
||||
...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
|
||||
...(this.realtimeInputConfig ? { realtimeInputConfig: this.realtimeInputConfig } : {}),
|
||||
inputAudioTranscription: {},
|
||||
outputAudioTranscription: {},
|
||||
...(typeof this.config.enableAffectiveDialog === "boolean"
|
||||
? { enableAffectiveDialog: this.config.enableAffectiveDialog }
|
||||
: {}),
|
||||
...(this.thinkingConfig ? { thinkingConfig: this.thinkingConfig } : {}),
|
||||
},
|
||||
config: buildGoogleLiveConnectConfig(this.config),
|
||||
callbacks: {
|
||||
onopen: () => {
|
||||
this.connected = true;
|
||||
@@ -657,14 +692,67 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private get realtimeInputConfig(): RealtimeInputConfig | undefined {
|
||||
return buildRealtimeInputConfig(this.config);
|
||||
/**
 * Creates a browser-facing Google Live session descriptor backed by a
 * single-use ephemeral auth token.
 *
 * The token is constrained to the resolved model and to the connect
 * configuration built from the provider settings, instructions and tools.
 *
 * @param req - Browser session request (provider config plus optional model,
 *   voice, instructions and tools overrides).
 * @returns The session descriptor: websocket URL, token name as
 *   `clientSecret`, audio formats, initial setup message and expiry in
 *   seconds since the epoch.
 * @throws Error when no API key is configured or available from the
 *   environment, or when the token response carries no usable name.
 */
async function createGoogleRealtimeBrowserSession(
  req: RealtimeVoiceBrowserSessionCreateRequest,
): Promise<RealtimeVoiceBrowserSession> {
  const config = normalizeProviderConfig(req.providerConfig);
  const apiKey = config.apiKey || resolveEnvApiKey();
  if (!apiKey) {
    throw new Error("Google Gemini API key missing");
  }

  // Request-level overrides win over provider config, then the defaults.
  const model = req.model ?? config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL;
  const voice = req.voice ?? config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE;

  // Read the clock once so both expiry timestamps share the same base.
  const nowMs = Date.now();
  const expiresAtMs = nowMs + GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS;
  const newSessionExpiresAtMs = nowMs + GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS;

  const ai = createGoogleGenAI({
    apiKey,
    httpOptions: {
      apiVersion: GOOGLE_REALTIME_BROWSER_API_VERSION,
    },
  });

  const token = await ai.authTokens.create({
    config: {
      uses: 1,
      expireTime: new Date(expiresAtMs).toISOString(),
      newSessionExpireTime: new Date(newSessionExpiresAtMs).toISOString(),
      liveConnectConstraints: {
        model,
        config: buildGoogleLiveConnectConfig({
          ...config,
          apiKey,
          model,
          voice,
          instructions: req.instructions,
          tools: req.tools,
        }),
      },
    },
  });

  // The token's resource name is what the browser presents as its secret.
  const clientSecret = token.name?.trim();
  if (!clientSecret) {
    throw new Error("Google Live browser session did not return an ephemeral token");
  }

  return {
    provider: "google",
    transport: "json-pcm-websocket",
    protocol: "google-live-bidi",
    clientSecret,
    websocketUrl: GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL,
    audio: {
      inputEncoding: "pcm16",
      inputSampleRateHz: GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
      outputEncoding: "pcm16",
      outputSampleRateHz: 24_000,
    },
    initialMessage: buildBrowserInitialSetup(model),
    model,
    voice,
    // Reported in whole seconds, unlike the millisecond value used above.
    expiresAt: Math.floor(expiresAtMs / 1000),
  };
}
|
||||
|
||||
export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
|
||||
@@ -700,6 +788,7 @@ export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin
|
||||
thinkingBudget: config.thinkingBudget,
|
||||
});
|
||||
},
|
||||
createBrowserSession: createGoogleRealtimeBrowserSession,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -707,5 +796,7 @@ export {
|
||||
GOOGLE_REALTIME_DEFAULT_API_VERSION,
|
||||
GOOGLE_REALTIME_DEFAULT_MODEL,
|
||||
GOOGLE_REALTIME_DEFAULT_VOICE,
|
||||
GOOGLE_REALTIME_BROWSER_API_VERSION,
|
||||
GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL,
|
||||
};
|
||||
export type { GoogleRealtimeVoiceProviderConfig };
|
||||
|
||||
Reference in New Issue
Block a user