Files
openclaw/extensions/google/realtime-voice-provider.ts
2026-05-06 02:39:15 +01:00

953 lines
31 KiB
TypeScript

import { randomUUID } from "node:crypto";
import type {
ActivityHandling,
Behavior,
EndSensitivity,
FunctionDeclaration,
FunctionResponse,
FunctionResponseScheduling,
LiveConnectConfig,
LiveServerContent,
LiveServerMessage,
LiveServerToolCall,
Modality,
RealtimeInputConfig,
StartSensitivity,
ThinkingConfig,
TurnCoverage,
} from "@google/genai";
import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
import type {
RealtimeVoiceAudioFormat,
RealtimeVoiceBridge,
RealtimeVoiceBrowserSession,
RealtimeVoiceBrowserSessionCreateRequest,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
RealtimeVoiceTool,
RealtimeVoiceToolResultOptions,
} from "openclaw/plugin-sdk/realtime-voice";
import {
convertPcmToMulaw8k,
mulawToPcm,
REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME,
resamplePcm,
} from "openclaw/plugin-sdk/realtime-voice";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import { createGoogleGenAI } from "./google-genai-runtime.js";
// Defaults applied when the provider config omits model/voice/API version.
const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-2025";
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
// Sample rate of the PCM audio sent to Google Live (see toGoogleInputPcm16k).
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
// Browser (ephemeral-token) sessions use the v1alpha surface and this
// constrained BidiGenerateContent websocket endpoint.
const GOOGLE_REALTIME_BROWSER_API_VERSION = "v1alpha";
const GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL =
"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
// Cap on audio chunks buffered while the live session is still connecting.
const MAX_PENDING_AUDIO_CHUNKS = 320;
// Silence run length that triggers an audioStreamEnd signal when the config
// does not specify silenceDurationMs.
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 500;
// Ephemeral browser token lifetimes: overall session vs. new-session window.
const GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS = 30 * 60 * 1000;
const GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS = 60 * 1000;
// Reconnect policy after an unexpected close: exponential backoff from the
// base delay, capped, up to the max attempt count.
const GOOGLE_REALTIME_RECONNECT_MAX_ATTEMPTS = 3;
const GOOGLE_REALTIME_RECONNECT_BASE_DELAY_MS = 250;
const GOOGLE_REALTIME_RECONNECT_MAX_DELAY_MS = 2_000;
// Precomputed lookup: µ-law byte (0-255) -> linear 16-bit PCM sample.
const MULAW_LINEAR_SAMPLES = new Int16Array(256);
for (let i = 0; i < MULAW_LINEAR_SAMPLES.length; i += 1) {
  MULAW_LINEAR_SAMPLES[i] = decodeMulawSample(i);
}
// User-facing sensitivity values; mapped to START_/END_SENSITIVITY_* enums.
type GoogleRealtimeSensitivity = "low" | "high";
// Thinking effort levels; uppercased into ThinkingConfig by buildThinkingConfig.
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
// Kebab-case aliases for the Live API ActivityHandling enum.
type GoogleRealtimeActivityHandling = "start-of-activity-interrupts" | "no-interruption";
// Kebab-case aliases for the Live API TurnCoverage enum.
type GoogleRealtimeTurnCoverage = "only-activity" | "all-input" | "audio-activity-and-all-video";
// Normalized provider config as read from OpenClaw plugin configuration.
type GoogleRealtimeVoiceProviderConfig = {
  apiKey?: string;
  model?: string;
  voice?: string;
  temperature?: number;
  apiVersion?: string;
  prefixPaddingMs?: number;
  silenceDurationMs?: number;
  startSensitivity?: GoogleRealtimeSensitivity;
  endSensitivity?: GoogleRealtimeSensitivity;
  activityHandling?: GoogleRealtimeActivityHandling;
  turnCoverage?: GoogleRealtimeTurnCoverage;
  automaticActivityDetectionDisabled?: boolean;
  enableAffectiveDialog?: boolean;
  sessionResumption?: boolean;
  contextWindowCompression?: boolean;
  thinkingLevel?: GoogleRealtimeThinkingLevel;
  thinkingBudget?: number;
};
// Everything needed to open a live connection: resolved apiKey is mandatory,
// plus optional per-session instructions/tools and the tuning knobs above.
type GoogleRealtimeLiveConfig = {
  apiKey: string;
  instructions?: string;
  tools?: RealtimeVoiceTool[];
  model?: string;
  voice?: string;
  temperature?: number;
  apiVersion?: string;
  prefixPaddingMs?: number;
  silenceDurationMs?: number;
  startSensitivity?: GoogleRealtimeSensitivity;
  endSensitivity?: GoogleRealtimeSensitivity;
  activityHandling?: GoogleRealtimeActivityHandling;
  turnCoverage?: GoogleRealtimeTurnCoverage;
  automaticActivityDetectionDisabled?: boolean;
  enableAffectiveDialog?: boolean;
  sessionResumption?: boolean;
  contextWindowCompression?: boolean;
  thinkingLevel?: GoogleRealtimeThinkingLevel;
  thinkingBudget?: number;
};
// Bridge construction input: the SDK's create request merged with live config.
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & GoogleRealtimeLiveConfig;
// Minimal structural view of the @google/genai live session — only the
// members this bridge actually calls.
type GoogleLiveSession = {
  sendClientContent: (params: {
    turns?: Array<{ role: string; parts: Array<{ text: string }> }>;
    turnComplete?: boolean;
  }) => void;
  sendRealtimeInput: (params: {
    audio?: { data: string; mimeType: string };
    audioStreamEnd?: boolean;
  }) => void;
  sendToolResponse: (params: { functionResponses: FunctionResponse[] | FunctionResponse }) => void;
  close: () => void;
};
/**
 * Local alias for the shared string normalizer (behavior is defined by
 * normalizeOptionalString — presumably trim + empty → undefined).
 */
function trimToUndefined(value: unknown): string | undefined {
  const normalized = normalizeOptionalString(value);
  return normalized;
}
/** Returns the value when it is a finite number; otherwise undefined. */
function asFiniteNumber(value: unknown): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
}
/** Returns the value when it is a real boolean; otherwise undefined. */
function asBoolean(value: unknown): boolean | undefined {
  if (typeof value !== "boolean") {
    return undefined;
  }
  return value;
}
/** Parses a case-insensitive "low"/"high" sensitivity value. */
function asSensitivity(value: unknown): GoogleRealtimeSensitivity | undefined {
  const lowered = normalizeOptionalString(value)?.toLowerCase();
  if (lowered === "low" || lowered === "high") {
    return lowered;
  }
  return undefined;
}
/** Parses a case-insensitive thinking level ("minimal"/"low"/"medium"/"high"). */
function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefined {
  const lowered = normalizeOptionalString(value)?.toLowerCase();
  switch (lowered) {
    case "minimal":
    case "low":
    case "medium":
    case "high":
      return lowered;
    default:
      return undefined;
  }
}
/**
 * Parses an activity-handling value, accepting several spelling aliases
 * (underscores are treated as hyphens, case-insensitive).
 */
function asActivityHandling(value: unknown): GoogleRealtimeActivityHandling | undefined {
  const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-");
  if (normalized === undefined) {
    return undefined;
  }
  const interruptAliases = [
    "start-of-activity-interrupts",
    "start-of-activity-interrupt",
    "interrupt",
    "interrupts",
  ];
  if (interruptAliases.includes(normalized)) {
    return "start-of-activity-interrupts";
  }
  const noInterruptionAliases = ["no-interruption", "no-interruptions", "none"];
  if (noInterruptionAliases.includes(normalized)) {
    return "no-interruption";
  }
  return undefined;
}
/**
 * Parses a turn-coverage value; both the short kebab forms and the full
 * "turn-includes-…" spellings are accepted (underscores become hyphens).
 */
function asTurnCoverage(value: unknown): GoogleRealtimeTurnCoverage | undefined {
  const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-");
  if (!normalized) {
    return undefined;
  }
  const aliases: Record<string, GoogleRealtimeTurnCoverage> = {
    "only-activity": "only-activity",
    "turn-includes-only-activity": "only-activity",
    "all-input": "all-input",
    "turn-includes-all-input": "all-input",
    "audio-activity-and-all-video": "audio-activity-and-all-video",
    "turn-includes-audio-activity-and-all-video": "audio-activity-and-all-video",
  };
  return aliases[normalized];
}
/**
 * Locates the Google-specific config record inside a provider config object.
 * Lookup order: `config.providers.google`, then `config.google`, then the
 * top-level config itself (so a flat config still works). Never returns
 * undefined in practice — the top-level config is the final fallback.
 */
function resolveGoogleRealtimeProviderConfigRecord(
  config: Record<string, unknown>,
): Record<string, unknown> | undefined {
  const providers = asPlainRecord(config.providers);
  return asPlainRecord(providers?.google) ?? asPlainRecord(config.google) ?? config;
}
/** Narrows a value to a plain (non-null, non-array) object record. */
function asPlainRecord(value: unknown): Record<string, unknown> | undefined {
  return typeof value === "object" && value !== null && !Array.isArray(value)
    ? (value as Record<string, unknown>)
    : undefined;
}
/**
 * Normalizes the raw provider config (plus the optional OpenClaw global
 * config, used only as an apiKey fallback) into a typed, validated record.
 * Unknown or malformed values collapse to undefined field-by-field.
 */
function normalizeProviderConfig(
  config: RealtimeVoiceProviderConfig,
  cfg?: OpenClawConfig,
): GoogleRealtimeVoiceProviderConfig {
  const record = resolveGoogleRealtimeProviderConfigRecord(config);
  const apiKey = normalizeResolvedSecretInputString({
    value: record?.apiKey ?? cfg?.models?.providers?.google?.apiKey,
    path: "plugins.entries.voice-call.config.realtime.providers.google.apiKey",
  });
  return {
    apiKey,
    model: trimToUndefined(record?.model),
    voice: trimToUndefined(record?.voice),
    temperature: asFiniteNumber(record?.temperature),
    apiVersion: trimToUndefined(record?.apiVersion),
    prefixPaddingMs: asFiniteNumber(record?.prefixPaddingMs),
    silenceDurationMs: asFiniteNumber(record?.silenceDurationMs),
    startSensitivity: asSensitivity(record?.startSensitivity),
    endSensitivity: asSensitivity(record?.endSensitivity),
    activityHandling: asActivityHandling(record?.activityHandling),
    turnCoverage: asTurnCoverage(record?.turnCoverage),
    automaticActivityDetectionDisabled: asBoolean(record?.automaticActivityDetectionDisabled),
    enableAffectiveDialog: asBoolean(record?.enableAffectiveDialog),
    sessionResumption: asBoolean(record?.sessionResumption),
    contextWindowCompression: asBoolean(record?.contextWindowCompression),
    thinkingLevel: asThinkingLevel(record?.thinkingLevel),
    thinkingBudget: asFiniteNumber(record?.thinkingBudget),
  };
}
/** Falls back to GEMINI_API_KEY, then GOOGLE_API_KEY, from the environment. */
function resolveEnvApiKey(): string | undefined {
  const { GEMINI_API_KEY, GOOGLE_API_KEY } = process.env;
  return trimToUndefined(GEMINI_API_KEY) ?? trimToUndefined(GOOGLE_API_KEY);
}
/** Maps the friendly sensitivity value onto the Live API enum string. */
function mapStartSensitivity(
  value: GoogleRealtimeSensitivity | undefined,
): StartSensitivity | undefined {
  if (value === "high") {
    return "START_SENSITIVITY_HIGH" as StartSensitivity;
  }
  if (value === "low") {
    return "START_SENSITIVITY_LOW" as StartSensitivity;
  }
  return undefined;
}
/** Maps the friendly sensitivity value onto the Live API enum string. */
function mapEndSensitivity(
  value: GoogleRealtimeSensitivity | undefined,
): EndSensitivity | undefined {
  if (value === "high") {
    return "END_SENSITIVITY_HIGH" as EndSensitivity;
  }
  if (value === "low") {
    return "END_SENSITIVITY_LOW" as EndSensitivity;
  }
  return undefined;
}
/** Maps the kebab-case activity handling alias onto the Live API enum. */
function mapActivityHandling(
  value: GoogleRealtimeActivityHandling | undefined,
): ActivityHandling | undefined {
  if (value === "no-interruption") {
    return "NO_INTERRUPTION" as ActivityHandling;
  }
  if (value === "start-of-activity-interrupts") {
    return "START_OF_ACTIVITY_INTERRUPTS" as ActivityHandling;
  }
  return undefined;
}
/** Maps the kebab-case turn coverage alias onto the Live API enum. */
function mapTurnCoverage(value: GoogleRealtimeTurnCoverage | undefined): TurnCoverage | undefined {
  if (!value) {
    return undefined;
  }
  const mapping = {
    "only-activity": "TURN_INCLUDES_ONLY_ACTIVITY",
    "all-input": "TURN_INCLUDES_ALL_INPUT",
    "audio-activity-and-all-video": "TURN_INCLUDES_AUDIO_ACTIVITY_AND_ALL_VIDEO",
  } as const;
  return mapping[value] as TurnCoverage | undefined;
}
/**
 * Builds the ThinkingConfig payload. A configured thinkingLevel takes
 * precedence over thinkingBudget; neither set yields undefined.
 */
function buildThinkingConfig(config: GoogleRealtimeLiveConfig): ThinkingConfig | undefined {
  const { thinkingLevel, thinkingBudget } = config;
  if (thinkingLevel) {
    return { thinkingLevel: thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
  }
  return typeof thinkingBudget === "number" ? { thinkingBudget } : undefined;
}
function buildRealtimeInputConfig(
config: GoogleRealtimeLiveConfig,
): RealtimeInputConfig | undefined {
const startSensitivity = mapStartSensitivity(config.startSensitivity);
const endSensitivity = mapEndSensitivity(config.endSensitivity);
const activityHandling = mapActivityHandling(config.activityHandling);
const turnCoverage = mapTurnCoverage(config.turnCoverage);
const automaticActivityDetection = {
...(typeof config.automaticActivityDetectionDisabled === "boolean"
? { disabled: config.automaticActivityDetectionDisabled }
: {}),
...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}),
...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}),
...(typeof config.prefixPaddingMs === "number"
? { prefixPaddingMs: Math.max(0, Math.floor(config.prefixPaddingMs)) }
: {}),
...(typeof config.silenceDurationMs === "number"
? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) }
: {}),
};
const realtimeInputConfig = {
...(Object.keys(automaticActivityDetection).length > 0 ? { automaticActivityDetection } : {}),
...(activityHandling ? { activityHandling } : {}),
...(turnCoverage ? { turnCoverage } : {}),
};
return Object.keys(realtimeInputConfig).length > 0 ? realtimeInputConfig : undefined;
}
/**
 * Converts the SDK tool descriptions into Live API function declarations.
 * The agent-consult tool is marked NON_BLOCKING (its result is delivered
 * later via sendToolResponse with WHEN_IDLE scheduling).
 */
function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] {
  const declarations: FunctionDeclaration[] = [];
  for (const tool of tools ?? []) {
    const declaration: FunctionDeclaration = {
      name: tool.name,
      description: tool.description,
      parametersJsonSchema: tool.parameters,
    };
    if (tool.name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME) {
      declaration.behavior = "NON_BLOCKING" as Behavior;
    }
    declarations.push(declaration);
  }
  return declarations;
}
/**
 * Builds the LiveConnectConfig sent to ai.live.connect: audio-only responses,
 * voice selection, system instructions, tools, VAD tuning, transcription of
 * both directions, and optional affective-dialog/thinking settings.
 */
function buildGoogleLiveConnectConfig(config: GoogleRealtimeLiveConfig): LiveConnectConfig {
  const result: LiveConnectConfig = {
    responseModalities: ["AUDIO" as Modality],
    speechConfig: {
      voiceConfig: {
        prebuiltVoiceConfig: {
          voiceName: config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
        },
      },
    },
    systemInstruction: config.instructions,
    // Request transcripts for both the caller's audio and the model's audio.
    inputAudioTranscription: {},
    outputAudioTranscription: {},
  };
  // Only strictly-positive temperatures are forwarded.
  if (typeof config.temperature === "number" && config.temperature > 0) {
    result.temperature = config.temperature;
  }
  const functionDeclarations = buildFunctionDeclarations(config.tools);
  if (functionDeclarations.length > 0) {
    result.tools = [{ functionDeclarations }];
  }
  const realtimeInputConfig = buildRealtimeInputConfig(config);
  if (realtimeInputConfig) {
    result.realtimeInputConfig = realtimeInputConfig;
  }
  if (typeof config.enableAffectiveDialog === "boolean") {
    result.enableAffectiveDialog = config.enableAffectiveDialog;
  }
  const thinkingConfig = buildThinkingConfig(config);
  if (thinkingConfig) {
    result.thinkingConfig = thinkingConfig;
  }
  return result;
}
/** Ensures the model id carries the "models/" resource prefix exactly once. */
function toGoogleModelResource(model: string): string {
  const prefix = "models/";
  return model.startsWith(prefix) ? model : prefix + model;
}
/**
 * Builds the first websocket message a browser client sends on the
 * BidiGenerateContent stream: audio-only responses plus transcription of
 * both input and output audio.
 */
function buildBrowserInitialSetup(model: string) {
  const setup = {
    model: toGoogleModelResource(model),
    generationConfig: {
      responseModalities: ["AUDIO" as Modality],
    },
    inputAudioTranscription: {},
    outputAudioTranscription: {},
  };
  return { setup };
}
/**
 * Extracts the `rate=` parameter from a PCM mime type such as
 * "audio/pcm;rate=24000"; defaults to 24 kHz when absent or invalid.
 */
function parsePcmSampleRate(mimeType: string | undefined): number {
  const fallback = 24_000;
  if (!mimeType) {
    return fallback;
  }
  const match = /(?:^|[;,\s])rate=(\d+)/i.exec(mimeType);
  if (!match) {
    return fallback;
  }
  const rate = Number.parseInt(match[1] ?? "", 10);
  return Number.isFinite(rate) && rate > 0 ? rate : fallback;
}
/**
 * True when every µ-law byte is 0xff (the µ-law encoding of zero amplitude).
 * An empty buffer is not considered silence.
 */
function isMulawSilence(audio: Buffer): boolean {
  if (audio.length === 0) {
    return false;
  }
  for (const byte of audio) {
    if (byte !== 0xff) {
      return false;
    }
  }
  return true;
}
/**
 * True when every complete little-endian 16-bit sample is zero. A trailing
 * odd byte is ignored; a buffer with no complete sample is not silence.
 */
function isPcm16Silence(audio: Buffer): boolean {
  const sampleCount = Math.floor(audio.length / 2);
  if (sampleCount === 0) {
    return false;
  }
  for (let offset = 0; offset < sampleCount * 2; offset += 2) {
    if (audio.readInt16LE(offset) !== 0) {
      return false;
    }
  }
  return true;
}
/**
 * Renders a websocket close event as a compact "code=… reason=… [clean=…]"
 * string for error messages; tolerates a fully absent event.
 */
function formatGoogleLiveCloseEvent(
  event:
    | {
        code?: number;
        reason?: string;
        wasClean?: boolean;
      }
    | undefined,
): string {
  if (event === undefined) {
    return "code=unknown reason=unknown";
  }
  const parts = [
    `code=${typeof event.code === "number" ? event.code : "unknown"}`,
    `reason=${event.reason?.trim() || "none"}`,
  ];
  if (typeof event.wasClean === "boolean") {
    parts.push(`clean=${event.wasClean}`);
  }
  return parts.join(" ");
}
/**
 * Bridges an OpenClaw realtime voice call to the Google Gemini Live API via
 * the @google/genai SDK. Responsibilities visible in this class:
 * - transcodes caller audio (µ-law 8 kHz or PCM16) to 16 kHz PCM for Google,
 *   and Google's PCM output back to the call's audio format;
 * - buffers inbound audio until the session reports setupComplete;
 * - detects runs of silence and signals audioStreamEnd to the model;
 * - forwards tool calls/results, transcripts, and barge-in (interrupted);
 * - tracks a session-resumption handle and reconnects with exponential
 *   backoff after unexpected closes.
 */
class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
  readonly supportsToolResultContinuation = true;
  // Live session handle; null whenever no websocket is open.
  private session: GoogleLiveSession | null = null;
  // Socket-level state (onopen) vs. protocol-level readiness (setupComplete).
  private connected = false;
  private sessionConfigured = false;
  // Set by close() so onclose can distinguish deliberate shutdown.
  private intentionallyClosed = false;
  // Audio received before the session is configured, bounded by
  // MAX_PENDING_AUDIO_CHUNKS (overflow chunks are dropped).
  private pendingAudio: Buffer[] = [];
  // Ensures onReady fires only once, even across reconnects.
  private sessionReadyFired = false;
  // Running total of trailing silence, used to emit audioStreamEnd.
  private consecutiveSilenceMs = 0;
  private audioStreamEnded = false;
  // callId -> function name, needed to build FunctionResponse payloads.
  private pendingFunctionNames = new Map<string, string>();
  private readonly audioFormat: RealtimeVoiceAudioFormat;
  // Latest resumable handle from sessionResumptionUpdate messages.
  private resumptionHandle: string | undefined;
  private reconnectAttempts = 0;
  private reconnectTimer: ReturnType<typeof setTimeout> | undefined;
  constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {
    // Default to telephony µ-law 8 kHz when the caller did not specify.
    this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME === config.audioFormat ? config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ : config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ;
  }
  /**
   * Opens (or re-opens) the Live session. Resets per-connection state but
   * keeps the resumption handle so a reconnect can resume the conversation.
   * Session resumption and context-window compression are on unless the
   * config explicitly sets them to false.
   */
  async connect(): Promise<void> {
    this.intentionallyClosed = false;
    this.sessionConfigured = false;
    this.sessionReadyFired = false;
    this.consecutiveSilenceMs = 0;
    this.audioStreamEnded = false;
    this.pendingFunctionNames.clear();
    const ai = createGoogleGenAI({
      apiKey: this.config.apiKey,
      httpOptions: {
        apiVersion: this.config.apiVersion ?? GOOGLE_REALTIME_DEFAULT_API_VERSION,
      },
    });
    this.session = (await ai.live.connect({
      model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
      config: {
        ...buildGoogleLiveConnectConfig(this.config),
        ...(this.config.sessionResumption === false
          ? {}
          : {
              // Empty object requests resumption support; a stored handle
              // resumes the previous session after a reconnect.
              sessionResumption: this.resumptionHandle ? { handle: this.resumptionHandle } : {},
            }),
        ...(this.config.contextWindowCompression === false
          ? {}
          : { contextWindowCompression: { slidingWindow: {} } }),
      },
      callbacks: {
        onopen: () => {
          this.connected = true;
        },
        onmessage: (message) => {
          this.handleMessage(message);
        },
        onerror: (event) => {
          const error =
            event.error instanceof Error
              ? event.error
              : new Error(
                  typeof event.message === "string" ? event.message : "Google Live API error",
                );
          this.config.onError?.(error);
        },
        onclose: (event) => {
          this.connected = false;
          this.sessionConfigured = false;
          this.pendingFunctionNames.clear();
          this.session = null;
          if (this.intentionallyClosed) {
            this.config.onClose?.("completed");
            return;
          }
          // Unexpected close: try to reconnect before reporting failure.
          const closeDetails = formatGoogleLiveCloseEvent(event);
          if (this.scheduleReconnect(closeDetails)) {
            return;
          }
          this.config.onError?.(
            new Error(`Google Live session closed after reconnect attempts: ${closeDetails}`),
          );
          this.config.onClose?.("error");
        },
      },
    })) as GoogleLiveSession;
  }
  /**
   * Forwards a caller audio chunk. Before the session is configured, chunks
   * are buffered (bounded). Silent chunks are still forwarded until enough
   * consecutive silence accumulates, at which point audioStreamEnd is sent
   * once and further silence is dropped until speech resumes.
   */
  sendAudio(audio: Buffer): void {
    if (!this.session || !this.connected || !this.sessionConfigured) {
      if (this.pendingAudio.length < MAX_PENDING_AUDIO_CHUNKS) {
        this.pendingAudio.push(audio);
      }
      return;
    }
    const silent = this.isSilence(audio);
    if (silent && this.audioStreamEnded) {
      return;
    }
    if (!silent) {
      this.consecutiveSilenceMs = 0;
      this.audioStreamEnded = false;
    }
    const pcm16k = this.toGoogleInputPcm16k(audio);
    this.session.sendRealtimeInput({
      audio: {
        data: pcm16k.toString("base64"),
        mimeType: `audio/pcm;rate=${GOOGLE_REALTIME_INPUT_SAMPLE_RATE}`,
      },
    });
    if (!silent) {
      return;
    }
    // Accumulate silence duration (derived from chunk length and the call's
    // audio format) and signal end-of-stream once the threshold is reached.
    const silenceThresholdMs =
      typeof this.config.silenceDurationMs === "number"
        ? Math.max(0, Math.floor(this.config.silenceDurationMs))
        : DEFAULT_AUDIO_STREAM_END_SILENCE_MS;
    const bytesPerSample = this.audioFormat.encoding === "pcm16" ? 2 : 1;
    this.consecutiveSilenceMs += Math.round(
      (audio.length / bytesPerSample / this.audioFormat.sampleRateHz) * 1000,
    );
    if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) {
      this.session.sendRealtimeInput({ audioStreamEnd: true });
      this.audioStreamEnded = true;
    }
  }
  // Media timestamps are not needed by this provider.
  setMediaTimestamp(_ts: number): void {}
  /** Sends a trimmed user text turn; no-op when empty or not yet ready. */
  sendUserMessage(text: string): void {
    const normalized = text.trim();
    if (!normalized || !this.session || !this.connected || !this.sessionConfigured) {
      return;
    }
    this.session.sendClientContent({
      turns: [{ role: "user", parts: [{ text: normalized }] }],
      turnComplete: true,
    });
  }
  /** Kicks off the call by injecting a greeting prompt as a user turn. */
  triggerGreeting(instructions?: string): void {
    const greetingPrompt =
      instructions?.trim() || "Start the call now. Greet the caller naturally and keep it brief.";
    this.sendUserMessage(greetingPrompt);
  }
  /**
   * Sends a tool result back to the model. Non-object results are wrapped as
   * { output: result }. The agent-consult tool is scheduled WHEN_IDLE and may
   * signal willContinue (streamed/partial results); willContinue on any other
   * tool is rejected via onError. The callId→name mapping is retained while
   * a continuation is pending.
   */
  submitToolResult(
    callId: string,
    result: unknown,
    options?: RealtimeVoiceToolResultOptions,
  ): void {
    if (!this.session) {
      return;
    }
    const name = this.pendingFunctionNames.get(callId);
    if (!name) {
      this.config.onError?.(
        new Error(
          `Google Live function response is missing a matching function call for ${callId}`,
        ),
      );
      return;
    }
    try {
      const isConsultTool = name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME;
      const functionResponse: FunctionResponse = {
        id: callId,
        name,
        response:
          result && typeof result === "object" && !Array.isArray(result)
            ? (result as Record<string, unknown>)
            : { output: result },
      };
      if (isConsultTool) {
        functionResponse.scheduling = "WHEN_IDLE" as FunctionResponseScheduling;
        if (options?.willContinue === true) {
          functionResponse.willContinue = true;
        }
      } else if (options?.willContinue === true) {
        this.config.onError?.(
          new Error(
            `Google Live continuation is only supported for ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME}`,
          ),
        );
        return;
      }
      this.session.sendToolResponse({
        functionResponses: [functionResponse],
      });
      if (options?.willContinue !== true) {
        this.pendingFunctionNames.delete(callId);
      }
    } catch (error) {
      this.config.onError?.(
        error instanceof Error ? error : new Error("Failed to send Google Live function response"),
      );
    }
  }
  // Playback marks need no acknowledgement for this provider.
  acknowledgeMark(): void {}
  /** Tears down the session and cancels any pending reconnect. */
  close(): void {
    this.intentionallyClosed = true;
    this.connected = false;
    this.sessionConfigured = false;
    if (this.reconnectTimer) {
      clearTimeout(this.reconnectTimer);
      this.reconnectTimer = undefined;
    }
    this.pendingAudio = [];
    this.consecutiveSilenceMs = 0;
    this.audioStreamEnded = false;
    this.pendingFunctionNames.clear();
    const session = this.session;
    this.session = null;
    session?.close();
  }
  isConnected(): boolean {
    return this.connected && this.sessionConfigured;
  }
  /** Silence check dispatched on the call's audio encoding. */
  private isSilence(audio: Buffer): boolean {
    return this.audioFormat.encoding === "pcm16" ? isPcm16Silence(audio) : isMulawSilence(audio);
  }
  /** Decodes µ-law input to linear PCM; PCM16 input passes through. */
  private toInputPcm(audio: Buffer): Buffer {
    return this.audioFormat.encoding === "pcm16" ? audio : mulawToPcm(audio);
  }
  /**
   * Converts a caller chunk to 16 kHz PCM16 for Google. The common telephony
   * case (µ-law 8 kHz → 16 kHz) uses the fast interpolating converter;
   * everything else goes through generic decode + resample.
   */
  private toGoogleInputPcm16k(audio: Buffer): Buffer {
    if (
      this.audioFormat.encoding === "g711_ulaw" &&
      this.audioFormat.sampleRateHz === 8_000 &&
      GOOGLE_REALTIME_INPUT_SAMPLE_RATE === 16_000
    ) {
      return convertMulaw8kToPcm16k(audio);
    }
    return resamplePcm(
      this.toInputPcm(audio),
      this.audioFormat.sampleRateHz,
      GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
    );
  }
  /** Converts Google's PCM output back into the call's audio format. */
  private toOutputAudio(pcm: Buffer, sampleRate: number): Buffer {
    return this.audioFormat.encoding === "pcm16"
      ? resamplePcm(pcm, sampleRate, this.audioFormat.sampleRateHz)
      : convertPcmToMulaw8k(pcm, sampleRate);
  }
  /** Dispatches a server message to the lifecycle/content/tool handlers. */
  private handleMessage(message: LiveServerMessage): void {
    this.captureSessionLifecycle(message);
    if (message.setupComplete) {
      this.handleSetupComplete();
    }
    if (message.serverContent) {
      this.handleServerContent(message.serverContent);
    }
    if (message.toolCall) {
      this.handleToolCall(message.toolCall);
    }
  }
  /**
   * Records resumption handles and reports goAway warnings. The cast exists
   * because goAway/sessionResumptionUpdate are read structurally rather than
   * from the SDK's message type. goAway is surfaced through onError — there
   * is no dedicated lifecycle callback for it here.
   */
  private captureSessionLifecycle(message: LiveServerMessage): void {
    const raw = message as unknown as {
      goAway?: { timeLeft?: string };
      sessionResumptionUpdate?: { newHandle?: string; resumable?: boolean };
    };
    const update = raw.sessionResumptionUpdate;
    if (update?.resumable && update.newHandle) {
      this.resumptionHandle = update.newHandle;
    }
    if (raw.goAway?.timeLeft) {
      this.config.onError?.(new Error(`Google Live session goAway: ${raw.goAway.timeLeft}`));
    }
  }
  /** Flushes buffered audio and fires onReady exactly once. */
  private handleSetupComplete(): void {
    this.sessionConfigured = true;
    this.reconnectAttempts = 0;
    for (const chunk of this.pendingAudio.splice(0)) {
      this.sendAudio(chunk);
    }
    if (!this.sessionReadyFired) {
      this.sessionReadyFired = true;
      this.config.onReady?.();
    }
  }
  /**
   * Handles model output: barge-in clears queued playback, transcripts are
   * forwarded for both directions, inline audio parts are transcoded and
   * emitted with a playback mark, thought parts are skipped, and plain text
   * parts become assistant transcripts only when no audio transcription is
   * present for this message.
   */
  private handleServerContent(content: LiveServerContent): void {
    if (content.interrupted) {
      this.config.onClearAudio();
    }
    if (content.inputTranscription?.text) {
      this.config.onTranscript?.(
        "user",
        content.inputTranscription.text,
        content.inputTranscription.finished ?? false,
      );
    }
    if (content.outputTranscription?.text) {
      this.config.onTranscript?.(
        "assistant",
        content.outputTranscription.text,
        content.outputTranscription.finished ?? false,
      );
    }
    let emittedAssistantText = false;
    for (const part of content.modelTurn?.parts ?? []) {
      if (part.inlineData?.data) {
        const pcm = Buffer.from(part.inlineData.data, "base64");
        const sampleRate = parsePcmSampleRate(part.inlineData.mimeType);
        const audio = this.toOutputAudio(pcm, sampleRate);
        if (audio.length > 0) {
          this.config.onAudio(audio);
          this.config.onMark?.(`audio-${randomUUID()}`);
        }
        continue;
      }
      if (part.thought) {
        continue;
      }
      if (!content.outputTranscription?.text && typeof part.text === "string" && part.text.trim()) {
        emittedAssistantText = true;
        this.config.onTranscript?.("assistant", part.text, content.turnComplete ?? false);
      }
    }
    // NOTE(review): this conditional `return` is a no-op — the function ends
    // immediately afterwards either way. Possibly a leftover from an earlier
    // implementation; confirm before removing.
    if (!emittedAssistantText && content.turnComplete && content.waitingForInput === false) {
      return;
    }
  }
  /**
   * Registers each function call (synthesizing a callId when the server
   * omits one) and forwards it to the host via onToolCall.
   */
  private handleToolCall(toolCall: LiveServerToolCall): void {
    for (const call of toolCall.functionCalls ?? []) {
      const name = call.name?.trim();
      if (!name) {
        continue;
      }
      const callId = call.id?.trim() || `google-live-${randomUUID()}`;
      this.pendingFunctionNames.set(callId, name);
      this.config.onToolCall?.({
        itemId: callId,
        callId,
        name,
        args: call.args ?? {},
      });
    }
  }
  /**
   * Schedules a reconnect with capped exponential backoff. Returns false when
   * attempts are exhausted (the caller then reports terminal failure). Each
   * scheduling is also surfaced via onError so the host can log it.
   */
  private scheduleReconnect(closeDetails: string): boolean {
    if (this.reconnectAttempts >= GOOGLE_REALTIME_RECONNECT_MAX_ATTEMPTS) {
      return false;
    }
    const attempt = ++this.reconnectAttempts;
    const delayMs = Math.min(
      GOOGLE_REALTIME_RECONNECT_MAX_DELAY_MS,
      GOOGLE_REALTIME_RECONNECT_BASE_DELAY_MS * 2 ** (attempt - 1),
    );
    this.config.onError?.(
      new Error(
        `Google Live session closed unexpectedly (${closeDetails}); reconnecting ${attempt}/${GOOGLE_REALTIME_RECONNECT_MAX_ATTEMPTS} in ${delayMs}ms`,
      ),
    );
    this.reconnectTimer = setTimeout(() => {
      this.reconnectTimer = undefined;
      if (this.intentionallyClosed) {
        return;
      }
      this.connect().catch((error: unknown) => {
        const message = error instanceof Error ? error.message : String(error);
        this.config.onError?.(error instanceof Error ? error : new Error(message));
        if (!this.scheduleReconnect(`connect failed: ${message}`)) {
          this.config.onClose?.("error");
        }
      });
    }, delayMs);
    return true;
  }
}
/**
 * Fast path for telephony audio: decodes 8 kHz µ-law via the precomputed
 * lookup table and upsamples to 16 kHz by writing each decoded sample
 * followed by its linear interpolation with the next sample. The final
 * sample is paired with itself.
 */
function convertMulaw8kToPcm16k(muLaw: Buffer): Buffer {
  if (muLaw.length === 0) {
    return Buffer.alloc(0);
  }
  // Each µ-law byte yields two 16-bit output samples → 4 output bytes.
  const out = Buffer.alloc(muLaw.length * 4);
  let writeOffset = 0;
  for (let index = 0; index < muLaw.length; index += 1) {
    const sample = MULAW_LINEAR_SAMPLES[muLaw[index] ?? 0] ?? 0;
    const following = MULAW_LINEAR_SAMPLES[muLaw[index + 1] ?? muLaw[index] ?? 0] ?? sample;
    out.writeInt16LE(sample, writeOffset);
    out.writeInt16LE(Math.round((sample + following) / 2), writeOffset + 2);
    writeOffset += 4;
  }
  return out;
}
/**
 * Decodes one G.711 µ-law byte into a linear 16-bit PCM sample: invert the
 * byte, split into sign / 3-bit exponent / 4-bit mantissa, rebuild the
 * magnitude with the 132 (0x84) bias, then apply the sign.
 */
function decodeMulawSample(value: number): number {
  const inverted = ~value & 0xff;
  const negative = (inverted & 0x80) !== 0;
  const exponent = (inverted >> 4) & 0x07;
  const mantissa = inverted & 0x0f;
  const magnitude = (((mantissa << 3) + 132) << exponent) - 132;
  return negative ? -magnitude : magnitude;
}
/**
 * Mints an ephemeral single-use auth token (v1alpha) constrained to the
 * requested model/voice/tools, and returns the connection details a browser
 * client needs to open the Live websocket directly.
 */
async function createGoogleRealtimeBrowserSession(
  req: RealtimeVoiceBrowserSessionCreateRequest,
): Promise<RealtimeVoiceBrowserSession> {
  const providerConfig = normalizeProviderConfig(req.providerConfig);
  const apiKey = providerConfig.apiKey || resolveEnvApiKey();
  if (!apiKey) {
    throw new Error("Google Gemini API key missing");
  }
  // Request overrides win over provider config, which wins over defaults.
  const model = req.model ?? providerConfig.model ?? GOOGLE_REALTIME_DEFAULT_MODEL;
  const voice = req.voice ?? providerConfig.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE;
  const expiresAtMs = Date.now() + GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS;
  const newSessionExpiresAtMs = Date.now() + GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS;
  const genAi = createGoogleGenAI({
    apiKey,
    httpOptions: {
      apiVersion: GOOGLE_REALTIME_BROWSER_API_VERSION,
    },
  });
  // The token is constrained server-side to exactly this live configuration.
  const liveConnectConstraints = {
    model,
    config: buildGoogleLiveConnectConfig({
      ...providerConfig,
      apiKey,
      model,
      voice,
      instructions: req.instructions,
      tools: req.tools,
    }),
  };
  const token = await genAi.authTokens.create({
    config: {
      uses: 1,
      expireTime: new Date(expiresAtMs).toISOString(),
      newSessionExpireTime: new Date(newSessionExpiresAtMs).toISOString(),
      liveConnectConstraints,
    },
  });
  const clientSecret = token.name?.trim();
  if (!clientSecret) {
    throw new Error("Google Live browser session did not return an ephemeral token");
  }
  return {
    provider: "google",
    transport: "provider-websocket",
    protocol: "google-live-bidi",
    clientSecret,
    websocketUrl: GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL,
    audio: {
      inputEncoding: "pcm16",
      inputSampleRateHz: GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
      outputEncoding: "pcm16",
      outputSampleRateHz: 24_000,
    },
    initialMessage: buildBrowserInitialSetup(model),
    model,
    voice,
    // Seconds since epoch, matching the token's expireTime.
    expiresAt: Math.floor(expiresAtMs / 1000),
  };
}
/**
 * Plugin entry point: describes the Google Live voice provider's identity,
 * capabilities, and factory functions for bridges and browser sessions.
 */
export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
  return {
    id: "google",
    label: "Google Live Voice",
    defaultModel: GOOGLE_REALTIME_DEFAULT_MODEL,
    autoSelectOrder: 20,
    capabilities: {
      transports: ["provider-websocket", "gateway-relay"],
      // Telephony µ-law 8 kHz and PCM16 24 kHz are accepted both ways; the
      // bridge transcodes to/from Google's native rates internally.
      inputAudioFormats: [
        REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
        REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
      ],
      outputAudioFormats: [
        REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ,
        REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
      ],
      supportsBrowserSession: true,
      supportsBargeIn: true,
      supportsToolCalls: true,
      supportsVideoFrames: true,
      supportsSessionResumption: true,
    },
    resolveConfig: ({ cfg, rawConfig }) => normalizeProviderConfig(rawConfig, cfg),
    // Configured when an apiKey is present in config or the environment.
    isConfigured: ({ providerConfig }) =>
      Boolean(normalizeProviderConfig(providerConfig).apiKey || resolveEnvApiKey()),
    createBridge: (req) => {
      const config = normalizeProviderConfig(req.providerConfig);
      const apiKey = config.apiKey || resolveEnvApiKey();
      if (!apiKey) {
        throw new Error("Google Gemini API key missing");
      }
      // Spread the request first so the normalized provider settings win.
      return new GoogleRealtimeVoiceBridge({
        ...req,
        apiKey,
        model: config.model,
        voice: config.voice,
        temperature: config.temperature,
        apiVersion: config.apiVersion,
        prefixPaddingMs: config.prefixPaddingMs,
        silenceDurationMs: config.silenceDurationMs,
        startSensitivity: config.startSensitivity,
        endSensitivity: config.endSensitivity,
        activityHandling: config.activityHandling,
        turnCoverage: config.turnCoverage,
        automaticActivityDetectionDisabled: config.automaticActivityDetectionDisabled,
        enableAffectiveDialog: config.enableAffectiveDialog,
        sessionResumption: config.sessionResumption,
        contextWindowCompression: config.contextWindowCompression,
        thinkingLevel: config.thinkingLevel,
        thinkingBudget: config.thinkingBudget,
      });
    },
    createBrowserSession: createGoogleRealtimeBrowserSession,
  };
}