feat(google): add realtime voice provider

This commit is contained in:
Peter Steinberger
2026-04-24 09:08:09 +01:00
parent c138368040
commit b5e5f2cede
13 changed files with 1127 additions and 141 deletions

View File

@@ -11,6 +11,7 @@ import {
} from "./generation-provider-metadata.js";
import { geminiMemoryEmbeddingProviderAdapter } from "./memory-embedding-adapter.js";
import { registerGoogleProvider } from "./provider-registration.js";
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
import { buildGoogleSpeechProvider } from "./speech-provider.js";
import { createGeminiWebSearchProvider } from "./src/gemini-web-search-provider.js";
@@ -156,6 +157,7 @@ export default definePluginEntry({
api.registerImageGenerationProvider(createLazyGoogleImageGenerationProvider());
api.registerMediaUnderstandingProvider(createLazyGoogleMediaUnderstandingProvider());
api.registerMusicGenerationProvider(createLazyGoogleMusicGenerationProvider());
api.registerRealtimeVoiceProvider(buildGoogleRealtimeVoiceProvider());
api.registerSpeechProvider(buildGoogleSpeechProvider());
api.registerVideoGenerationProvider(createLazyGoogleVideoGenerationProvider());
api.registerWebSearchProvider(createGeminiWebSearchProvider());

View File

@@ -49,6 +49,7 @@
"memoryEmbeddingProviders": ["gemini"],
"imageGenerationProviders": ["google"],
"musicGenerationProviders": ["google"],
"realtimeVoiceProviders": ["google"],
"speechProviders": ["google"],
"videoGenerationProviders": ["google"],
"webSearchProviders": ["gemini"]

View File

@@ -0,0 +1,354 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
// Shape of the mocked Google Live session object returned by the mocked
// ai.live.connect(); each method is a vitest spy so tests can inspect calls.
type MockGoogleLiveSession = {
  close: ReturnType<typeof vi.fn>;
  sendClientContent: ReturnType<typeof vi.fn>;
  sendRealtimeInput: ReturnType<typeof vi.fn>;
  sendToolResponse: ReturnType<typeof vi.fn>;
};
// Shape of the params object the provider passes to ai.live.connect().
type MockGoogleLiveConnectParams = {
  model: string;
  config: Record<string, unknown>;
  callbacks: {
    onopen: () => void;
    onmessage: (message: Record<string, unknown>) => void;
    onerror: (event: { error?: unknown; message?: string }) => void;
    onclose: () => void;
  };
};
// vi.hoisted so the shared spies exist before the vi.mock factory below runs.
const { connectMock, session } = vi.hoisted(() => {
  const session: MockGoogleLiveSession = {
    close: vi.fn(),
    sendClientContent: vi.fn(),
    sendRealtimeInput: vi.fn(),
    sendToolResponse: vi.fn(),
  };
  const connectMock = vi.fn(async (_params: MockGoogleLiveConnectParams) => session);
  return { connectMock, session };
});
// Replace the GenAI runtime so the provider under test never opens a socket.
vi.mock("./google-genai-runtime.js", () => ({
  createGoogleGenAI: vi.fn(() => ({
    live: {
      connect: connectMock,
    },
  })),
}));
// Return the params of the most recent ai.live.connect() invocation,
// failing loudly if the provider never connected.
function lastConnectParams(): MockGoogleLiveConnectParams {
  const latest = connectMock.mock.calls.at(-1);
  if (!latest) {
    throw new Error("expected google live connect call");
  }
  return latest[0];
}
// End-to-end tests for the Google realtime voice provider against the mocked
// Live API runtime declared above. Ordering matters: most tests must replay
// onopen + setupComplete before the bridge accepts audio or text.
describe("buildGoogleRealtimeVoiceProvider", () => {
  beforeEach(() => {
    // Reset shared spies so each test only sees its own interactions.
    connectMock.mockClear();
    session.close.mockClear();
    session.sendClientContent.mockClear();
    session.sendRealtimeInput.mockClear();
    session.sendToolResponse.mockClear();
    // Clear env fallbacks so config resolution is exercised explicitly.
    delete process.env.GEMINI_API_KEY;
    delete process.env.GOOGLE_API_KEY;
  });
  // resolveConfig should read providers.google from rawConfig and fall back
  // to the model-provider apiKey from the main cfg.
  it("normalizes provider config and cfg model-provider key fallback", () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const resolved = provider.resolveConfig?.({
      cfg: {
        models: {
          providers: {
            google: {
              apiKey: "cfg-key",
            },
          },
        },
      } as never,
      rawConfig: {
        providers: {
          google: {
            model: "gemini-live-2.5-flash-preview",
            voice: "Puck",
            temperature: 0.4,
            silenceDurationMs: 700,
            startSensitivity: "high",
          },
        },
      },
    });
    expect(resolved).toEqual({
      apiKey: "cfg-key",
      model: "gemini-live-2.5-flash-preview",
      voice: "Puck",
      temperature: 0.4,
      apiVersion: undefined,
      prefixPaddingMs: undefined,
      silenceDurationMs: 700,
      startSensitivity: "high",
      endSensitivity: undefined,
      enableAffectiveDialog: undefined,
      thinkingLevel: undefined,
      thinkingBudget: undefined,
    });
  });
  // Verifies the Live setup payload: modalities, voice, system instruction,
  // and tool schemas mapped to functionDeclarations.
  it("connects with Google Live setup config and tool declarations", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const bridge = provider.createBridge({
      providerConfig: {
        apiKey: "gemini-key",
        model: "gemini-live-2.5-flash-preview",
        voice: "Kore",
        temperature: 0.3,
        startSensitivity: "low",
      },
      instructions: "Speak briefly.",
      tools: [
        {
          type: "function",
          name: "lookup",
          description: "Look something up",
          parameters: {
            type: "object",
            properties: {
              query: { type: "string" },
            },
            required: ["query"],
          },
        },
      ],
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
    });
    await bridge.connect();
    expect(connectMock).toHaveBeenCalledTimes(1);
    expect(lastConnectParams()).toMatchObject({
      model: "gemini-live-2.5-flash-preview",
      config: {
        responseModalities: ["AUDIO"],
        temperature: 0.3,
        systemInstruction: "Speak briefly.",
        speechConfig: {
          voiceConfig: {
            prebuiltVoiceConfig: {
              voiceName: "Kore",
            },
          },
        },
        outputAudioTranscription: {},
        tools: [
          {
            functionDeclarations: [
              {
                name: "lookup",
                description: "Look something up",
                parametersJsonSchema: {
                  type: "object",
                  properties: {
                    query: { type: "string" },
                  },
                  required: ["query"],
                },
              },
            ],
          },
        ],
      },
    });
  });
  // temperature: 0 is dropped from the setup config rather than sent.
  it("omits zero temperature for native audio responses", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const bridge = provider.createBridge({
      providerConfig: {
        apiKey: "gemini-key",
        temperature: 0,
      },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
    });
    await bridge.connect();
    expect(lastConnectParams().config).not.toHaveProperty("temperature");
  });
  // Audio sent before setupComplete must be buffered, then drained (resampled
  // to 16 kHz PCM) once the session is configured; onReady fires once.
  it("waits for setup completion before draining audio and firing ready", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const onReady = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
      onReady,
    });
    await bridge.connect();
    lastConnectParams().callbacks.onopen();
    bridge.sendAudio(Buffer.from([0xff, 0xff]));
    expect(session.sendRealtimeInput).not.toHaveBeenCalled();
    expect(onReady).not.toHaveBeenCalled();
    lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
    expect(onReady).toHaveBeenCalledTimes(1);
    expect(session.sendRealtimeInput).toHaveBeenCalledTimes(1);
    expect(session.sendRealtimeInput.mock.calls[0]?.[0].audio).toMatchObject({
      data: expect.any(String),
      mimeType: "audio/pcm;rate=16000",
    });
  });
  // 0xff bytes are mu-law silence; three 20ms silent frames cross the 60ms
  // threshold, triggering audioStreamEnd exactly once until speech resumes.
  it("marks the Google audio stream complete after sustained telephony silence", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key", silenceDurationMs: 60 },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
    });
    await bridge.connect();
    lastConnectParams().callbacks.onopen();
    lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
    const silence20ms = Buffer.alloc(160, 0xff);
    bridge.sendAudio(silence20ms);
    bridge.sendAudio(silence20ms);
    bridge.sendAudio(silence20ms);
    expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
    const callsAfterStreamEnd = session.sendRealtimeInput.mock.calls.length;
    // Further silence after stream end must be suppressed entirely.
    bridge.sendAudio(silence20ms);
    expect(session.sendRealtimeInput).toHaveBeenCalledTimes(callsAfterStreamEnd);
    session.sendRealtimeInput.mockClear();
    // Speech (0x7f) resets the silence counter; a new silent run re-triggers.
    bridge.sendAudio(Buffer.alloc(160, 0x7f));
    bridge.sendAudio(silence20ms);
    bridge.sendAudio(silence20ms);
    bridge.sendAudio(silence20ms);
    expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
  });
  // Text prompts are trimmed and sent as complete user turns.
  it("sends text prompts as ordered client turns", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
    });
    await bridge.connect();
    lastConnectParams().callbacks.onopen();
    lastConnectParams().callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
    bridge.sendUserMessage?.(" Say hello. ");
    expect(session.sendClientContent).toHaveBeenCalledWith({
      turns: [{ role: "user", parts: [{ text: "Say hello." }] }],
      turnComplete: true,
    });
  });
  // 480 bytes of 24 kHz PCM (240 samples) downsample to 80 mu-law bytes at 8 kHz.
  it("converts Google PCM output to mu-law audio", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const onAudio = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio,
      onClearAudio: vi.fn(),
    });
    const pcm24k = Buffer.alloc(480);
    await bridge.connect();
    lastConnectParams().callbacks.onmessage({
      setupComplete: { sessionId: "session-1" },
      serverContent: {
        modelTurn: {
          parts: [
            {
              inlineData: {
                mimeType: "audio/L16;codec=pcm;rate=24000",
                data: pcm24k.toString("base64"),
              },
            },
          ],
        },
      },
    });
    expect(onAudio).toHaveBeenCalledTimes(1);
    expect(onAudio.mock.calls[0]?.[0]).toBeInstanceOf(Buffer);
    expect(onAudio.mock.calls[0]?.[0]).toHaveLength(80);
  });
  // Parts flagged as model "thoughts" must never surface as transcripts.
  it("does not forward Google thought text as assistant transcript", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const onTranscript = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
      onTranscript,
    });
    await bridge.connect();
    lastConnectParams().callbacks.onmessage({
      setupComplete: {},
      serverContent: {
        modelTurn: {
          parts: [{ text: "internal reasoning", thought: true }],
        },
      },
    });
    expect(onTranscript).not.toHaveBeenCalled();
  });
  // Tool calls surface via onToolCall and results go back with matching ids.
  it("forwards Live API tool calls and submits matching function responses", async () => {
    const provider = buildGoogleRealtimeVoiceProvider();
    const onToolCall = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
      onToolCall,
    });
    await bridge.connect();
    lastConnectParams().callbacks.onmessage({
      setupComplete: { sessionId: "session-1" },
      toolCall: {
        functionCalls: [{ id: "call-1", name: "lookup", args: { query: "hi" } }],
      },
    });
    expect(onToolCall).toHaveBeenCalledWith({
      itemId: "call-1",
      callId: "call-1",
      name: "lookup",
      args: { query: "hi" },
    });
    bridge.submitToolResult("call-1", { result: "ok" });
    expect(session.sendToolResponse).toHaveBeenCalledWith({
      functionResponses: [
        {
          id: "call-1",
          response: { result: "ok" },
        },
      ],
    });
  });
});

View File

@@ -0,0 +1,535 @@
import { randomUUID } from "node:crypto";
import {
EndSensitivity,
Modality,
StartSensitivity,
type FunctionDeclaration,
type FunctionResponse,
type LiveServerContent,
type LiveServerMessage,
type LiveServerToolCall,
type RealtimeInputConfig,
type ThinkingConfig,
} from "@google/genai";
import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
import type {
RealtimeVoiceBridge,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
RealtimeVoiceTool,
} from "openclaw/plugin-sdk/realtime-voice";
import { convertPcmToMulaw8k, mulawToPcm, resamplePcm } from "openclaw/plugin-sdk/realtime-voice";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import { normalizeOptionalString } from "openclaw/plugin-sdk/text-runtime";
import { createGoogleGenAI } from "./google-genai-runtime.js";
// Defaults applied when the provider config omits model/voice/API version.
const GOOGLE_REALTIME_DEFAULT_MODEL = "gemini-2.5-flash-native-audio-preview-12-2025";
const GOOGLE_REALTIME_DEFAULT_VOICE = "Kore";
const GOOGLE_REALTIME_DEFAULT_API_VERSION = "v1beta";
// Gemini Live consumes 16 kHz PCM input; telephony callers supply 8 kHz mu-law.
const GOOGLE_REALTIME_INPUT_SAMPLE_RATE = 16_000;
const TELEPHONY_SAMPLE_RATE = 8000;
// Cap on caller audio chunks buffered while the Live session is still setting up.
const MAX_PENDING_AUDIO_CHUNKS = 320;
// Default silence run length before audioStreamEnd is emitted.
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
type GoogleRealtimeSensitivity = "low" | "high";
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
// Normalized per-provider config after resolveConfig has coerced raw input.
type GoogleRealtimeVoiceProviderConfig = {
  apiKey?: string;
  model?: string;
  voice?: string;
  temperature?: number;
  apiVersion?: string;
  prefixPaddingMs?: number;
  silenceDurationMs?: number;
  startSensitivity?: GoogleRealtimeSensitivity;
  endSensitivity?: GoogleRealtimeSensitivity;
  enableAffectiveDialog?: boolean;
  thinkingLevel?: GoogleRealtimeThinkingLevel;
  thinkingBudget?: number;
};
// Bridge construction config: the SDK create-request plus resolved provider
// settings (apiKey is required by this point).
type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
  apiKey: string;
  model?: string;
  voice?: string;
  temperature?: number;
  apiVersion?: string;
  prefixPaddingMs?: number;
  silenceDurationMs?: number;
  startSensitivity?: GoogleRealtimeSensitivity;
  endSensitivity?: GoogleRealtimeSensitivity;
  enableAffectiveDialog?: boolean;
  thinkingLevel?: GoogleRealtimeThinkingLevel;
  thinkingBudget?: number;
};
// Minimal structural view of the @google/genai live session we depend on.
type GoogleLiveSession = {
  sendClientContent: (params: {
    turns?: Array<{ role: string; parts: Array<{ text: string }> }>;
    turnComplete?: boolean;
  }) => void;
  sendRealtimeInput: (params: {
    audio?: { data: string; mimeType: string };
    audioStreamEnd?: boolean;
  }) => void;
  sendToolResponse: (params: { functionResponses: FunctionResponse[] | FunctionResponse }) => void;
  close: () => void;
};
// Trim a string-ish value, mapping empty/non-string input to undefined.
function trimToUndefined(value: unknown): string | undefined {
  const trimmed = normalizeOptionalString(value);
  return trimmed;
}
// Accept only finite numbers; everything else (NaN, Infinity, non-numbers)
// becomes undefined.
function asFiniteNumber(value: unknown): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
}
// Pass through real booleans only; truthy/falsy coercion is deliberately avoided.
function asBoolean(value: unknown): boolean | undefined {
  if (typeof value !== "boolean") {
    return undefined;
  }
  return value;
}
// Case-insensitively narrow a raw value to "low" | "high".
function asSensitivity(value: unknown): GoogleRealtimeSensitivity | undefined {
  const text = normalizeOptionalString(value)?.toLowerCase();
  switch (text) {
    case "low":
    case "high":
      return text;
    default:
      return undefined;
  }
}
// Case-insensitively narrow a raw value to a supported thinking level.
function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefined {
  const text = normalizeOptionalString(value)?.toLowerCase();
  switch (text) {
    case "minimal":
    case "low":
    case "medium":
    case "high":
      return text;
    default:
      return undefined;
  }
}
// Locate the google-specific settings record. Accepts three layouts:
// { providers: { google: {...} } }, { google: {...} }, or an already-flat record.
function resolveGoogleRealtimeProviderConfigRecord(
  config: Record<string, unknown>,
): Record<string, unknown> | undefined {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);
  const providers = isRecord(config.providers) ? config.providers : undefined;
  const nested = providers?.google;
  if (isRecord(nested)) {
    return nested;
  }
  if (isRecord(config.google)) {
    return config.google;
  }
  return config;
}
// Coerce a raw provider config record (plus optional main-config fallback for
// the API key) into the normalized GoogleRealtimeVoiceProviderConfig shape.
// Every field is validated individually; invalid values become undefined.
function normalizeProviderConfig(
  config: RealtimeVoiceProviderConfig,
  cfg?: OpenClawConfig,
): GoogleRealtimeVoiceProviderConfig {
  const record = resolveGoogleRealtimeProviderConfigRecord(config);
  // The model-provider API key from the main config is the fallback source.
  const apiKey = normalizeResolvedSecretInputString({
    value: record?.apiKey ?? cfg?.models?.providers?.google?.apiKey,
    path: "plugins.entries.voice-call.config.realtime.providers.google.apiKey",
  });
  return {
    apiKey,
    model: trimToUndefined(record?.model),
    voice: trimToUndefined(record?.voice),
    temperature: asFiniteNumber(record?.temperature),
    apiVersion: trimToUndefined(record?.apiVersion),
    prefixPaddingMs: asFiniteNumber(record?.prefixPaddingMs),
    silenceDurationMs: asFiniteNumber(record?.silenceDurationMs),
    startSensitivity: asSensitivity(record?.startSensitivity),
    endSensitivity: asSensitivity(record?.endSensitivity),
    enableAffectiveDialog: asBoolean(record?.enableAffectiveDialog),
    thinkingLevel: asThinkingLevel(record?.thinkingLevel),
    thinkingBudget: asFiniteNumber(record?.thinkingBudget),
  };
}
// Environment fallback for the API key: GEMINI_API_KEY wins over GOOGLE_API_KEY.
function resolveEnvApiKey(): string | undefined {
  const geminiKey = trimToUndefined(process.env.GEMINI_API_KEY);
  return geminiKey ?? trimToUndefined(process.env.GOOGLE_API_KEY);
}
// Translate our "low"/"high" string to the SDK StartSensitivity enum.
function mapStartSensitivity(
  value: GoogleRealtimeSensitivity | undefined,
): StartSensitivity | undefined {
  if (value === "high") {
    return StartSensitivity.START_SENSITIVITY_HIGH;
  }
  if (value === "low") {
    return StartSensitivity.START_SENSITIVITY_LOW;
  }
  return undefined;
}
// Translate our "low"/"high" string to the SDK EndSensitivity enum.
function mapEndSensitivity(
  value: GoogleRealtimeSensitivity | undefined,
): EndSensitivity | undefined {
  if (value === "high") {
    return EndSensitivity.END_SENSITIVITY_HIGH;
  }
  if (value === "low") {
    return EndSensitivity.END_SENSITIVITY_LOW;
  }
  return undefined;
}
// Build the optional ThinkingConfig: an explicit level takes priority over a
// numeric token budget; absent both, thinking config is omitted entirely.
function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined {
  const { thinkingLevel, thinkingBudget } = config;
  if (thinkingLevel) {
    // SDK enum values are upper-cased variants of our lowercase levels.
    return { thinkingLevel: thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
  }
  if (typeof thinkingBudget === "number") {
    return { thinkingBudget };
  }
  return undefined;
}
function buildRealtimeInputConfig(
config: GoogleRealtimeVoiceBridgeConfig,
): RealtimeInputConfig | undefined {
const startSensitivity = mapStartSensitivity(config.startSensitivity);
const endSensitivity = mapEndSensitivity(config.endSensitivity);
const automaticActivityDetection = {
...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}),
...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}),
...(typeof config.prefixPaddingMs === "number"
? { prefixPaddingMs: Math.max(0, Math.floor(config.prefixPaddingMs)) }
: {}),
...(typeof config.silenceDurationMs === "number"
? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) }
: {}),
};
return Object.keys(automaticActivityDetection).length > 0
? { automaticActivityDetection }
: undefined;
}
// Map SDK tool definitions onto Gemini function declarations. The tool's JSON
// schema is forwarded verbatim via parametersJsonSchema.
function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] {
  const declarations: FunctionDeclaration[] = [];
  for (const tool of tools ?? []) {
    declarations.push({
      name: tool.name,
      description: tool.description,
      parametersJsonSchema: tool.parameters,
    });
  }
  return declarations;
}
// Extract the "rate=<n>" parameter from a PCM mime type (e.g.
// "audio/L16;codec=pcm;rate=24000"). Falls back to 24 kHz when absent/invalid.
function parsePcmSampleRate(mimeType: string | undefined): number {
  const match = /(?:^|[;,\s])rate=(\d+)/i.exec(mimeType ?? "");
  if (!match) {
    return 24_000;
  }
  const rate = Number.parseInt(match[1] ?? "", 10);
  return Number.isFinite(rate) && rate > 0 ? rate : 24_000;
}
// A non-empty frame consisting solely of 0xff bytes is mu-law digital silence.
// Empty buffers are NOT treated as silence.
function isMulawSilence(audio: Buffer): boolean {
  if (audio.length === 0) {
    return false;
  }
  for (const sample of audio) {
    if (sample !== 0xff) {
      return false;
    }
  }
  return true;
}
/**
 * Bridges a telephony-style audio stream (8 kHz mu-law) to a Google Gemini
 * Live session (16 kHz PCM in, PCM out converted back to mu-law).
 *
 * Lifecycle: connect() opens the Live session; audio sent before the server's
 * setupComplete message is buffered and drained once setup is acknowledged.
 */
class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
  private session: GoogleLiveSession | null = null;
  // Socket-level open (onopen fired).
  private connected = false;
  // Live API acknowledged setup (setupComplete received).
  private sessionConfigured = false;
  // Distinguishes a local close() from a remote drop in the onclose reason.
  private intentionallyClosed = false;
  // Caller audio queued until the session is configured (bounded).
  private pendingAudio: Buffer[] = [];
  // Ensures onReady fires at most once per connection.
  private sessionReadyFired = false;
  // Running length of the current mu-law silence run, in milliseconds.
  private consecutiveSilenceMs = 0;
  // True once audioStreamEnd was sent for the current silence run.
  private audioStreamEnded = false;
  constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {}
  /** Open the Live session and wire up server callbacks. */
  async connect(): Promise<void> {
    // Reset per-connection state so reconnects start clean.
    this.intentionallyClosed = false;
    this.sessionConfigured = false;
    this.sessionReadyFired = false;
    this.consecutiveSilenceMs = 0;
    this.audioStreamEnded = false;
    const ai = createGoogleGenAI({
      apiKey: this.config.apiKey,
      httpOptions: {
        apiVersion: this.config.apiVersion ?? GOOGLE_REALTIME_DEFAULT_API_VERSION,
      },
    });
    const functionDeclarations = buildFunctionDeclarations(this.config.tools);
    this.session = (await ai.live.connect({
      model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
      config: {
        responseModalities: [Modality.AUDIO],
        // Zero/negative temperatures are omitted rather than sent.
        ...(typeof this.config.temperature === "number" && this.config.temperature > 0
          ? { temperature: this.config.temperature }
          : {}),
        speechConfig: {
          voiceConfig: {
            prebuiltVoiceConfig: {
              voiceName: this.config.voice ?? GOOGLE_REALTIME_DEFAULT_VOICE,
            },
          },
        },
        systemInstruction: this.config.instructions,
        ...(functionDeclarations.length > 0 ? { tools: [{ functionDeclarations }] } : {}),
        ...(this.realtimeInputConfig ? { realtimeInputConfig: this.realtimeInputConfig } : {}),
        // Request transcripts for both directions of audio.
        inputAudioTranscription: {},
        outputAudioTranscription: {},
        ...(typeof this.config.enableAffectiveDialog === "boolean"
          ? { enableAffectiveDialog: this.config.enableAffectiveDialog }
          : {}),
        ...(this.thinkingConfig ? { thinkingConfig: this.thinkingConfig } : {}),
      },
      callbacks: {
        onopen: () => {
          this.connected = true;
        },
        onmessage: (message) => {
          this.handleMessage(message);
        },
        onerror: (event) => {
          // Normalize non-Error payloads into an Error before surfacing.
          const error =
            event.error instanceof Error
              ? event.error
              : new Error(
                  typeof event.message === "string" ? event.message : "Google Live API error",
                );
          this.config.onError?.(error);
        },
        onclose: () => {
          this.connected = false;
          this.sessionConfigured = false;
          const reason = this.intentionallyClosed ? "completed" : "error";
          this.session = null;
          this.config.onClose?.(reason);
        },
      },
    })) as GoogleLiveSession;
  }
  /**
   * Forward one mu-law telephony frame to the Live session.
   * Buffers until setup completes; resamples to 16 kHz PCM; after a sustained
   * silence run, emits audioStreamEnd once and suppresses further silence.
   */
  sendAudio(audio: Buffer): void {
    if (!this.session || !this.connected || !this.sessionConfigured) {
      // Not ready yet: queue (bounded) and drain on setupComplete.
      if (this.pendingAudio.length < MAX_PENDING_AUDIO_CHUNKS) {
        this.pendingAudio.push(audio);
      }
      return;
    }
    const silent = isMulawSilence(audio);
    if (silent && this.audioStreamEnded) {
      // Stream already marked ended; drop trailing silence entirely.
      return;
    }
    if (!silent) {
      // Speech resets the silence run and re-opens the stream.
      this.consecutiveSilenceMs = 0;
      this.audioStreamEnded = false;
    }
    const pcm16k = resamplePcm(
      mulawToPcm(audio),
      TELEPHONY_SAMPLE_RATE,
      GOOGLE_REALTIME_INPUT_SAMPLE_RATE,
    );
    this.session.sendRealtimeInput({
      audio: {
        data: pcm16k.toString("base64"),
        mimeType: `audio/pcm;rate=${GOOGLE_REALTIME_INPUT_SAMPLE_RATE}`,
      },
    });
    if (!silent) {
      return;
    }
    const silenceThresholdMs =
      typeof this.config.silenceDurationMs === "number"
        ? Math.max(0, Math.floor(this.config.silenceDurationMs))
        : DEFAULT_AUDIO_STREAM_END_SILENCE_MS;
    // One mu-law byte per sample at 8 kHz, so bytes / rate * 1000 = duration.
    this.consecutiveSilenceMs += Math.round((audio.length / TELEPHONY_SAMPLE_RATE) * 1000);
    if (!this.audioStreamEnded && this.consecutiveSilenceMs >= silenceThresholdMs) {
      this.session.sendRealtimeInput({ audioStreamEnd: true });
      this.audioStreamEnded = true;
    }
  }
  // Media timestamps are not needed by this provider.
  setMediaTimestamp(_ts: number): void {}
  /** Send a trimmed text prompt as a complete user turn. */
  sendUserMessage(text: string): void {
    const normalized = text.trim();
    if (!normalized || !this.session || !this.connected || !this.sessionConfigured) {
      return;
    }
    this.session.sendClientContent({
      turns: [{ role: "user", parts: [{ text: normalized }] }],
      turnComplete: true,
    });
  }
  /** Kick off the conversation with custom or default greeting instructions. */
  triggerGreeting(instructions?: string): void {
    const greetingPrompt =
      instructions?.trim() || "Start the call now. Greet the caller naturally and keep it brief.";
    this.sendUserMessage(greetingPrompt);
  }
  /**
   * Return a tool invocation's result to the model. Object results are passed
   * through; primitives are wrapped as { output: value }.
   */
  submitToolResult(callId: string, result: unknown): void {
    if (!this.session) {
      return;
    }
    this.session.sendToolResponse({
      functionResponses: [
        {
          id: callId,
          response:
            result && typeof result === "object"
              ? (result as Record<string, unknown>)
              : { output: result },
        },
      ],
    });
  }
  // Playback marks are generated locally; nothing to acknowledge.
  acknowledgeMark(): void {}
  /** Close the session and reset all per-connection state. */
  close(): void {
    this.intentionallyClosed = true;
    this.connected = false;
    this.sessionConfigured = false;
    this.pendingAudio = [];
    this.consecutiveSilenceMs = 0;
    this.audioStreamEnded = false;
    // Null the field before closing so the onclose callback sees no session.
    const session = this.session;
    this.session = null;
    session?.close();
  }
  isConnected(): boolean {
    return this.connected && this.sessionConfigured;
  }
  /** Dispatch one Live server message to the relevant handlers. */
  private handleMessage(message: LiveServerMessage): void {
    if (message.setupComplete) {
      this.handleSetupComplete();
    }
    if (message.serverContent) {
      this.handleServerContent(message.serverContent);
    }
    if (message.toolCall) {
      this.handleToolCall(message.toolCall);
    }
  }
  /** Mark the session configured, drain buffered audio, fire onReady once. */
  private handleSetupComplete(): void {
    this.sessionConfigured = true;
    // splice(0) empties the queue so re-entrant sendAudio can't re-buffer it.
    for (const chunk of this.pendingAudio.splice(0)) {
      this.sendAudio(chunk);
    }
    if (!this.sessionReadyFired) {
      this.sessionReadyFired = true;
      this.config.onReady?.();
    }
  }
  /** Route transcripts, audio parts, and interruptions from server content. */
  private handleServerContent(content: LiveServerContent): void {
    if (content.interrupted) {
      // Barge-in: drop any queued playback immediately.
      this.config.onClearAudio();
    }
    if (content.inputTranscription?.text) {
      this.config.onTranscript?.(
        "user",
        content.inputTranscription.text,
        content.inputTranscription.finished ?? false,
      );
    }
    if (content.outputTranscription?.text) {
      this.config.onTranscript?.(
        "assistant",
        content.outputTranscription.text,
        content.outputTranscription.finished ?? false,
      );
    }
    let emittedAssistantText = false;
    for (const part of content.modelTurn?.parts ?? []) {
      if (part.inlineData?.data) {
        // Model audio arrives base64-encoded PCM; convert to 8 kHz mu-law.
        const pcm = Buffer.from(part.inlineData.data, "base64");
        const sampleRate = parsePcmSampleRate(part.inlineData.mimeType);
        const muLaw = convertPcmToMulaw8k(pcm, sampleRate);
        if (muLaw.length > 0) {
          this.config.onAudio(muLaw);
          this.config.onMark?.(`audio-${randomUUID()}`);
        }
        continue;
      }
      if (part.thought) {
        // Internal model reasoning is never surfaced as a transcript.
        continue;
      }
      // Only fall back to raw part text when no output transcription exists.
      if (!content.outputTranscription?.text && typeof part.text === "string" && part.text.trim()) {
        emittedAssistantText = true;
        this.config.onTranscript?.("assistant", part.text, content.turnComplete ?? false);
      }
    }
    // NOTE(review): both branches of this check fall through to the same void
    // return — looks like leftover logic from an earlier revision; confirm
    // whether additional turn-completion handling was intended here.
    if (!emittedAssistantText && content.turnComplete && content.waitingForInput === false) {
      return;
    }
  }
  /** Forward each named function call to the host via onToolCall. */
  private handleToolCall(toolCall: LiveServerToolCall): void {
    for (const call of toolCall.functionCalls ?? []) {
      const name = call.name?.trim();
      if (!name) {
        continue;
      }
      // Synthesize an id if the server omitted one so results can round-trip.
      const callId = call.id?.trim() || `google-live-${randomUUID()}`;
      this.config.onToolCall?.({
        itemId: callId,
        callId,
        name,
        args: call.args ?? {},
      });
    }
  }
  private get realtimeInputConfig(): RealtimeInputConfig | undefined {
    return buildRealtimeInputConfig(this.config);
  }
  private get thinkingConfig(): ThinkingConfig | undefined {
    return buildThinkingConfig(this.config);
  }
}
/**
 * Build the Google realtime voice provider plugin entry.
 *
 * resolveConfig normalizes raw settings (with main-config API-key fallback),
 * isConfigured accepts either a configured key or an environment key, and
 * createBridge constructs a GoogleRealtimeVoiceBridge, throwing when no API
 * key can be resolved from config or environment.
 */
export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
  return {
    id: "google",
    label: "Google Live Voice",
    autoSelectOrder: 20,
    resolveConfig: ({ cfg, rawConfig }) => normalizeProviderConfig(rawConfig, cfg),
    isConfigured: ({ providerConfig }) =>
      Boolean(normalizeProviderConfig(providerConfig).apiKey || resolveEnvApiKey()),
    createBridge: (req) => {
      const config = normalizeProviderConfig(req.providerConfig);
      const apiKey = config.apiKey || resolveEnvApiKey();
      if (!apiKey) {
        throw new Error("Google Gemini API key missing");
      }
      return new GoogleRealtimeVoiceBridge({
        ...req,
        apiKey,
        model: config.model,
        voice: config.voice,
        temperature: config.temperature,
        apiVersion: config.apiVersion,
        prefixPaddingMs: config.prefixPaddingMs,
        silenceDurationMs: config.silenceDurationMs,
        startSensitivity: config.startSensitivity,
        endSensitivity: config.endSensitivity,
        enableAffectiveDialog: config.enableAffectiveDialog,
        thinkingLevel: config.thinkingLevel,
        thinkingBudget: config.thinkingBudget,
      });
    },
  };
}
export {
GOOGLE_REALTIME_DEFAULT_API_VERSION,
GOOGLE_REALTIME_DEFAULT_MODEL,
GOOGLE_REALTIME_DEFAULT_VOICE,
};
export type { GoogleRealtimeVoiceProviderConfig };

View File

@@ -1,105 +1,8 @@
const TELEPHONY_SAMPLE_RATE = 8000;
const RESAMPLE_FILTER_TAPS = 31;
const RESAMPLE_CUTOFF_GUARD = 0.94;
// Clamp a sample into the signed 16-bit range [-32768, 32767].
function clamp16(value: number): number {
  if (value > 32767) {
    return 32767;
  }
  if (value < -32768) {
    return -32768;
  }
  return value;
}
// Normalized sinc: sin(pi*x)/(pi*x), with the removable singularity at 0.
function sinc(x: number): number {
  return x === 0 ? 1 : Math.sin(Math.PI * x) / (Math.PI * x);
}
/**
 * Evaluate one band-limited output sample by convolving a finite low-pass
 * kernel centered on the fractional source position `srcPos`.
 *
 * The kernel is a Hann-windowed sinc with RESAMPLE_FILTER_TAPS taps; taps that
 * fall outside the input are skipped, and the result is normalized by the sum
 * of the coefficients actually used so edge samples keep correct gain.
 *
 * @param input 16-bit little-endian mono PCM buffer.
 * @param inputSamples number of valid samples in `input`.
 * @param srcPos fractional sample position to evaluate.
 * @param cutoffCyclesPerSample low-pass cutoff in cycles/sample (<= 0.5).
 * @returns the (unrounded, unclamped) filtered sample value.
 */
function sampleBandlimited(
  input: Buffer,
  inputSamples: number,
  srcPos: number,
  cutoffCyclesPerSample: number,
): number {
  const half = Math.floor(RESAMPLE_FILTER_TAPS / 2);
  const center = Math.floor(srcPos);
  let weighted = 0;
  let weightSum = 0;
  for (let tap = -half; tap <= half; tap++) {
    const sampleIndex = center + tap;
    if (sampleIndex < 0 || sampleIndex >= inputSamples) {
      // Out-of-range taps contribute nothing; normalization compensates.
      continue;
    }
    const distance = sampleIndex - srcPos;
    // Ideal low-pass response scaled by the cutoff.
    const lowPass = 2 * cutoffCyclesPerSample * sinc(2 * cutoffCyclesPerSample * distance);
    const tapIndex = tap + half;
    // Hann window to reduce ringing from the truncated kernel.
    const window = 0.5 - 0.5 * Math.cos((2 * Math.PI * tapIndex) / (RESAMPLE_FILTER_TAPS - 1));
    const coeff = lowPass * window;
    weighted += input.readInt16LE(sampleIndex * 2) * coeff;
    weightSum += coeff;
  }
  if (weightSum === 0) {
    // Degenerate kernel (all taps skipped/cancelled): nearest-neighbor fallback.
    const nearest = Math.max(0, Math.min(inputSamples - 1, Math.round(srcPos)));
    return input.readInt16LE(nearest * 2);
  }
  return weighted / weightSum;
}
/**
 * Resample 16-bit little-endian mono PCM down to the 8 kHz telephony rate
 * using the windowed band-limited kernel above. Returns the input unchanged
 * when it is already at 8 kHz, and an empty buffer for empty input.
 */
export function resamplePcmTo8k(input: Buffer, inputSampleRate: number): Buffer {
  if (inputSampleRate === TELEPHONY_SAMPLE_RATE) {
    return input;
  }
  const totalInputSamples = Math.floor(input.length / 2);
  if (totalInputSamples === 0) {
    return Buffer.alloc(0);
  }
  const step = inputSampleRate / TELEPHONY_SAMPLE_RATE;
  const totalOutputSamples = Math.floor(totalInputSamples / step);
  const result = Buffer.alloc(totalOutputSamples * 2);
  // Keep the low-pass cutoff just under the Nyquist limit of the slower rate.
  const maxCutoff = 0.5;
  const baseCutoff = step > 1 ? maxCutoff / step : maxCutoff;
  const cutoff = Math.max(0.01, baseCutoff * RESAMPLE_CUTOFF_GUARD);
  for (let outIndex = 0; outIndex < totalOutputSamples; outIndex++) {
    const filtered = sampleBandlimited(input, totalInputSamples, outIndex * step, cutoff);
    result.writeInt16LE(clamp16(Math.round(filtered)), outIndex * 2);
  }
  return result;
}
/**
 * Convert 16-bit little-endian PCM to 8-bit mu-law (G.711), one output byte
 * per input sample. A trailing odd byte is ignored.
 */
export function pcmToMulaw(pcm: Buffer): Buffer {
  const sampleCount = Math.floor(pcm.length / 2);
  const encoded = Buffer.alloc(sampleCount);
  for (let index = 0; index < sampleCount; index++) {
    encoded[index] = linearToMulaw(pcm.readInt16LE(index * 2));
  }
  return encoded;
}
// Downsample PCM to 8 kHz, then mu-law encode it for telephony playback.
export function convertPcmToMulaw8k(pcm: Buffer, inputSampleRate: number): Buffer {
  const telephonyPcm = resamplePcmTo8k(pcm, inputSampleRate);
  return pcmToMulaw(telephonyPcm);
}
export {
convertPcmToMulaw8k,
pcmToMulaw,
resamplePcmTo8k,
} from "openclaw/plugin-sdk/realtime-voice";
/**
* Chunk audio buffer into 20ms frames for streaming (8kHz mono mu-law).
@@ -111,25 +14,3 @@ export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, vo
}
})();
}
// Encode one signed 16-bit linear sample as G.711 mu-law: bias, find the
// segment (exponent) from the highest set bit, take a 4-bit mantissa, then
// complement the result per the standard.
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;
  const sign = sample < 0 ? 0x80 : 0;
  let magnitude = sample < 0 ? -sample : sample;
  if (magnitude > CLIP) {
    magnitude = CLIP;
  }
  magnitude += BIAS;
  let exponent = 7;
  let segmentMask = 0x4000;
  while (exponent > 0 && (magnitude & segmentMask) === 0) {
    segmentMask >>= 1;
    exponent--;
  }
  const mantissa = (magnitude >> (exponent + 3)) & 0x0f;
  return ~(sign | (exponent << 4) | mantissa) & 0xff;
}

View File

@@ -341,10 +341,13 @@ describe("web session", () => {
sock.ev.emit("creds.update", {});
sock.ev.emit("creds.update", {});
await flushCredsUpdate();
expect(inFlight).toBe(1);
(release as (() => void) | null)?.();
try {
await vi.waitFor(() => {
expect(inFlight).toBe(1);
});
} finally {
(release as (() => void) | null)?.();
}
await waitForCredsSaveQueue(authDir);
@@ -389,13 +392,16 @@ describe("web session", () => {
onError,
);
await flushCredsUpdate();
try {
await vi.waitFor(() => {
expect(inFlightA).toBe(1);
expect(inFlightB).toBe(1);
});
} finally {
(releaseA as (() => void) | null)?.();
(releaseB as (() => void) | null)?.();
}
expect(inFlightA).toBe(1);
expect(inFlightB).toBe(1);
(releaseA as (() => void) | null)?.();
(releaseB as (() => void) | null)?.();
await Promise.all([waitForCredsSaveQueue(authDirA), waitForCredsSaveQueue(authDirB)]);
expect(inFlightA).toBe(0);

View File

@@ -152,7 +152,7 @@ describe("monitorZaloProvider lifecycle", () => {
abort.abort();
await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1));
await vi.waitFor(() => expect(deleteWebhookMock).toHaveBeenCalledTimes(1), { timeout: 5000 });
expect(deleteWebhookMock).toHaveBeenCalledWith("test-token", undefined, 5000);
expect(settled).toBe(false);
expect(registry.httpRoutes).toHaveLength(2);