/*
 * File: openclaw/extensions/google/realtime-voice-provider.test.ts
 * Last modified: 2026-04-27 14:22:32 +01:00
 * Size: 678 lines, 20 KiB
 * Language: TypeScript
 */

import { REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ } from "openclaw/plugin-sdk/realtime-voice";
import { beforeEach, describe, expect, it, vi } from "vitest";
import { buildGoogleRealtimeVoiceProvider } from "./realtime-voice-provider.js";
/** Names of the session methods that the tests replace with vitest mocks. */
type MockGoogleLiveSessionMethod =
  | "close"
  | "sendClientContent"
  | "sendRealtimeInput"
  | "sendToolResponse";

/** Fake live session: one vitest mock function per session method. */
type MockGoogleLiveSession = Record<MockGoogleLiveSessionMethod, ReturnType<typeof vi.fn>>;
/** Event handlers carried on the argument passed to the mocked `live.connect`. */
type MockGoogleLiveCallbacks = {
  onopen: () => void;
  onmessage: (message: Record<string, unknown>) => void;
  onerror: (event: { error?: unknown; message?: string }) => void;
  onclose: () => void;
};

/** Shape of the single argument the tests read back from the mocked `live.connect`. */
type MockGoogleLiveConnectParams = {
  model: string;
  config: Record<string, unknown>;
  callbacks: MockGoogleLiveCallbacks;
};
// Created through vi.hoisted so the vi.mock factory in this file can reference them.
const { connectMock, createTokenMock, session } = vi.hoisted(() => {
  // One shared fake session; every mocked connect call resolves to this same object.
  const liveSession: MockGoogleLiveSession = {
    close: vi.fn(),
    sendClientContent: vi.fn(),
    sendRealtimeInput: vi.fn(),
    sendToolResponse: vi.fn(),
  };
  return {
    connectMock: vi.fn(async (_params: MockGoogleLiveConnectParams) => liveSession),
    // Each call resolves to a fresh token object with a fixed name.
    createTokenMock: vi.fn(async (_params: unknown) => {
      return { name: "auth_tokens/browser-session" };
    }),
    session: liveSession,
  };
});
// Replace the GenAI runtime module so the provider talks to the hoisted mocks
// instead of a real client.
vi.mock("./google-genai-runtime.js", () => {
  const createGoogleGenAI = vi.fn(() => {
    const authTokens = { create: createTokenMock };
    const live = { connect: connectMock };
    return { authTokens, live };
  });
  return { createGoogleGenAI };
});
/**
 * Returns the first argument of the most recent `connectMock` call.
 *
 * @throws Error when `connectMock` has not been called (or was called without an argument).
 */
function lastConnectParams(): MockGoogleLiveConnectParams {
  const latestCall = connectMock.mock.calls.at(-1);
  const params = latestCall?.[0];
  if (params) {
    return params;
  }
  throw new Error("expected google live connect call");
}
describe("buildGoogleRealtimeVoiceProvider", () => {
beforeEach(() => {
  // Forget calls recorded by earlier tests; mock implementations are left in place.
  connectMock.mockClear();
  createTokenMock.mockClear();
  for (const sessionMock of Object.values(session)) {
    sessionMock.mockClear();
  }
  // Remove ambient Google API keys from the environment
  // (presumably so key resolution only sees what each test supplies — confirm against the provider).
  for (const envName of ["GEMINI_API_KEY", "GOOGLE_API_KEY"]) {
    delete process.env[envName];
  }
});
it("normalizes provider config and cfg model-provider key fallback", () => {
  // Expectations encoded below:
  //  - apiKey is taken from cfg.models.providers.google because rawConfig has none;
  //  - "no_interruption" comes back as "no-interruption";
  //  - "turn_includes_only_activity" comes back as "only-activity".
  const expectedConfig = {
    apiKey: "cfg-key",
    model: "gemini-live-2.5-flash-preview",
    voice: "Puck",
    temperature: 0.4,
    apiVersion: undefined,
    prefixPaddingMs: undefined,
    silenceDurationMs: 700,
    startSensitivity: "high",
    endSensitivity: undefined,
    activityHandling: "no-interruption",
    turnCoverage: "only-activity",
    automaticActivityDetectionDisabled: false,
    enableAffectiveDialog: undefined,
    thinkingLevel: undefined,
    thinkingBudget: undefined,
  };

  const googleProvider = buildGoogleRealtimeVoiceProvider();
  const actualConfig = googleProvider.resolveConfig?.({
    cfg: {
      models: {
        providers: {
          google: { apiKey: "cfg-key" },
        },
      },
    } as never,
    rawConfig: {
      providers: {
        google: {
          model: "gemini-live-2.5-flash-preview",
          voice: "Puck",
          temperature: 0.4,
          silenceDurationMs: 700,
          startSensitivity: "high",
          activityHandling: "no_interruption",
          turnCoverage: "turn_includes_only_activity",
          automaticActivityDetectionDisabled: false,
        },
      },
    },
  });

  expect(actualConfig).toEqual(expectedConfig);
});
it("connects with Google Live setup config and tool declarations", async () => {
  const googleProvider = buildGoogleRealtimeVoiceProvider();
  const bridge = googleProvider.createBridge({
    providerConfig: {
      apiKey: "gemini-key",
      model: "gemini-live-2.5-flash-preview",
      voice: "Kore",
      temperature: 0.3,
      startSensitivity: "low",
      endSensitivity: "low",
      activityHandling: "no-interruption",
      turnCoverage: "only-activity",
    },
    instructions: "Speak briefly.",
    tools: [
      {
        type: "function",
        name: "lookup",
        description: "Look something up",
        parameters: {
          type: "object",
          properties: {
            query: { type: "string" },
          },
          required: ["query"],
        },
      },
      {
        type: "function",
        name: "openclaw_agent_consult",
        description: "Ask OpenClaw",
        parameters: {
          type: "object",
          properties: {
            question: { type: "string" },
          },
          required: ["question"],
        },
      },
    ],
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });

  await bridge.connect();

  // Both tools are expected inside one functionDeclarations list; only the
  // consult tool is expected to carry behavior "NON_BLOCKING".
  const expectedDeclarations = [
    {
      name: "lookup",
      description: "Look something up",
      parametersJsonSchema: {
        type: "object",
        properties: {
          query: { type: "string" },
        },
        required: ["query"],
      },
    },
    {
      name: "openclaw_agent_consult",
      description: "Ask OpenClaw",
      parametersJsonSchema: {
        type: "object",
        properties: {
          question: { type: "string" },
        },
        required: ["question"],
      },
      behavior: "NON_BLOCKING",
    },
  ];
  // Provider-level option values are expected as upper-case Live API constants.
  const expectedConfig = {
    responseModalities: ["AUDIO"],
    temperature: 0.3,
    systemInstruction: "Speak briefly.",
    speechConfig: {
      voiceConfig: {
        prebuiltVoiceConfig: {
          voiceName: "Kore",
        },
      },
    },
    outputAudioTranscription: {},
    realtimeInputConfig: {
      activityHandling: "NO_INTERRUPTION",
      automaticActivityDetection: {
        startOfSpeechSensitivity: "START_SENSITIVITY_LOW",
        endOfSpeechSensitivity: "END_SENSITIVITY_LOW",
      },
      turnCoverage: "TURN_INCLUDES_ONLY_ACTIVITY",
    },
    tools: [{ functionDeclarations: expectedDeclarations }],
  };

  expect(connectMock).toHaveBeenCalledTimes(1);
  expect(lastConnectParams()).toMatchObject({
    model: "gemini-live-2.5-flash-preview",
    config: expectedConfig,
  });
});
it("omits zero temperature for native audio responses", async () => {
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key", temperature: 0 },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });

  await bridge.connect();

  // A temperature of 0 must not show up as a key on the connect config at all.
  const { config } = lastConnectParams();
  expect(config).not.toHaveProperty("temperature");
});
it("creates constrained browser sessions for Google Live Talk", async () => {
  const provider = buildGoogleRealtimeVoiceProvider();
  // Named `browserSession` (not `session`) so it does not shadow the hoisted
  // `session` mock that the rest of this file asserts against.
  const browserSession = await provider.createBrowserSession?.({
    providerConfig: {
      apiKey: "gemini-key",
      model: "gemini-live-2.5-flash-preview",
      voice: "Puck",
      temperature: 0.4,
    },
    instructions: "Speak briefly.",
    tools: [
      {
        type: "function",
        name: "openclaw_agent_consult",
        description: "Ask OpenClaw",
        parameters: {
          type: "object",
          properties: {
            question: { type: "string" },
          },
          required: ["question"],
        },
      },
    ],
  });

  // Exactly one token request, limited to a single use and constrained to the
  // same model/config the session was asked for.
  expect(createTokenMock).toHaveBeenCalledTimes(1);
  expect(createTokenMock.mock.calls[0]?.[0]).toMatchObject({
    config: {
      uses: 1,
      liveConnectConstraints: {
        model: "gemini-live-2.5-flash-preview",
        config: {
          responseModalities: ["AUDIO"],
          temperature: 0.4,
          systemInstruction: "Speak briefly.",
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: {
                voiceName: "Puck",
              },
            },
          },
          tools: [
            {
              functionDeclarations: [
                {
                  name: "openclaw_agent_consult",
                  behavior: "NON_BLOCKING",
                },
              ],
            },
          ],
        },
      },
    },
  });

  // The mocked token name is expected to be surfaced as the client secret.
  expect(browserSession).toMatchObject({
    provider: "google",
    transport: "json-pcm-websocket",
    protocol: "google-live-bidi",
    clientSecret: "auth_tokens/browser-session",
    websocketUrl:
      "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained",
    audio: {
      inputEncoding: "pcm16",
      inputSampleRateHz: 16000,
      outputEncoding: "pcm16",
      outputSampleRateHz: 24000,
    },
    initialMessage: {
      setup: {
        model: "models/gemini-live-2.5-flash-preview",
        generationConfig: {
          responseModalities: ["AUDIO"],
        },
      },
    },
  });
});
it("waits for setup completion before draining audio and firing ready", async () => {
  const onReady = vi.fn();
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
    onReady,
  });
  await bridge.connect();
  const { callbacks } = lastConnectParams();

  // Open the connection and send audio before setupComplete has arrived:
  // nothing may reach the session and onReady must stay silent.
  callbacks.onopen();
  bridge.sendAudio(Buffer.from([0xff, 0xff]));
  expect(session.sendRealtimeInput).not.toHaveBeenCalled();
  expect(onReady).not.toHaveBeenCalled();

  // After setupComplete, onReady fires once and the earlier audio is sent.
  callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });
  expect(onReady).toHaveBeenCalledTimes(1);
  expect(session.sendRealtimeInput).toHaveBeenCalledTimes(1);
  const firstCall = session.sendRealtimeInput.mock.calls[0];
  expect(firstCall?.[0].audio).toMatchObject({
    data: expect.any(String),
    mimeType: "audio/pcm;rate=16000",
  });
});
it("marks the Google audio stream complete after sustained telephony silence", async () => {
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key", silenceDurationMs: 60 },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });
  await bridge.connect();
  const { callbacks } = lastConnectParams();
  callbacks.onopen();
  callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });

  // 160 bytes of 0xff; the test treats this as one 20 ms frame of silence,
  // so three frames line up with silenceDurationMs: 60.
  const silentFrame = Buffer.alloc(160, 0xff);
  const streamEndSignal = { audioStreamEnd: true };
  const sendSilentFrames = (count: number): void => {
    for (let sent = 0; sent < count; sent += 1) {
      bridge.sendAudio(silentFrame);
    }
  };

  sendSilentFrames(3);
  expect(session.sendRealtimeInput).toHaveBeenCalledWith(streamEndSignal);

  // Further silence after the stream end must not produce any new session calls.
  const callCountAtStreamEnd = session.sendRealtimeInput.mock.calls.length;
  sendSilentFrames(1);
  expect(session.sendRealtimeInput).toHaveBeenCalledTimes(callCountAtStreamEnd);

  // A non-silent frame followed by fresh silence must end the stream again.
  session.sendRealtimeInput.mockClear();
  bridge.sendAudio(Buffer.alloc(160, 0x7f));
  sendSilentFrames(3);
  expect(session.sendRealtimeInput).toHaveBeenCalledWith(streamEndSignal);
});
it("accepts PCM16 24 kHz audio without the telephony mu-law hop", async () => {
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });
  await bridge.connect();
  const { callbacks } = lastConnectParams();
  callbacks.onopen();
  callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });

  bridge.sendAudio(Buffer.alloc(480));

  // The forwarded payload is labelled 16 kHz PCM ...
  const expectedInput = {
    audio: {
      data: expect.any(String),
      mimeType: "audio/pcm;rate=16000",
    },
  };
  expect(session.sendRealtimeInput).toHaveBeenCalledWith(expectedInput);

  // ... and the 480 input bytes are expected to decode to 320 bytes.
  const firstCall = session.sendRealtimeInput.mock.calls[0];
  const decoded = Buffer.from(firstCall?.[0].audio.data, "base64");
  expect(decoded).toHaveLength(320);
});
it("can disable automatic VAD for manual activity signaling experiments", async () => {
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: {
      apiKey: "gemini-key",
      automaticActivityDetectionDisabled: true,
    },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });

  await bridge.connect();

  // The provider flag is expected as automaticActivityDetection.disabled on the connect config.
  const expectedConfig = {
    realtimeInputConfig: {
      automaticActivityDetection: { disabled: true },
    },
  };
  const { config } = lastConnectParams();
  expect(config).toMatchObject(expectedConfig);
});
it("sends text prompts as ordered client turns", async () => {
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });
  await bridge.connect();
  const { callbacks } = lastConnectParams();
  callbacks.onopen();
  callbacks.onmessage({ setupComplete: { sessionId: "session-1" } });

  bridge.sendUserMessage?.(" Say hello. ");

  // Surrounding whitespace is expected to be gone from the text that is sent.
  const expectedTurn = { role: "user", parts: [{ text: "Say hello." }] };
  expect(session.sendClientContent).toHaveBeenCalledWith({
    turns: [expectedTurn],
    turnComplete: true,
  });
});
it("converts Google PCM output to mu-law audio", async () => {
  const onAudio = vi.fn();
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio,
    onClearAudio: vi.fn(),
  });
  const pcm24k = Buffer.alloc(480);
  await bridge.connect();

  // Setup acknowledgement and the model's audio part arrive in one message.
  const audioPart = {
    inlineData: {
      mimeType: "audio/L16;codec=pcm;rate=24000",
      data: pcm24k.toString("base64"),
    },
  };
  const { callbacks } = lastConnectParams();
  callbacks.onmessage({
    setupComplete: { sessionId: "session-1" },
    serverContent: {
      modelTurn: { parts: [audioPart] },
    },
  });

  // With no audioFormat override, the 480 input bytes are expected as an 80-byte Buffer.
  expect(onAudio).toHaveBeenCalledTimes(1);
  const delivered = onAudio.mock.calls[0]?.[0];
  expect(delivered).toBeInstanceOf(Buffer);
  expect(delivered).toHaveLength(80);
});
it("can keep Google PCM output as PCM16 24 kHz audio", async () => {
  const onAudio = vi.fn();
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    audioFormat: REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ,
    onAudio,
    onClearAudio: vi.fn(),
  });
  const pcm24k = Buffer.alloc(480);
  await bridge.connect();

  const audioPart = {
    inlineData: {
      mimeType: "audio/L16;codec=pcm;rate=24000",
      data: pcm24k.toString("base64"),
    },
  };
  const { callbacks } = lastConnectParams();
  callbacks.onmessage({
    setupComplete: { sessionId: "session-1" },
    serverContent: {
      modelTurn: { parts: [audioPart] },
    },
  });

  // With the PCM16 24 kHz format selected, the bytes handed to onAudio must equal the input bytes.
  expect(onAudio).toHaveBeenCalledTimes(1);
  const delivered = onAudio.mock.calls[0]?.[0];
  expect(delivered).toEqual(pcm24k);
});
it("does not forward Google thought text as assistant transcript", async () => {
  const onTranscript = vi.fn();
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
    onTranscript,
  });
  await bridge.connect();

  // A text part flagged `thought: true` must never reach the transcript callback.
  const thoughtPart = { text: "internal reasoning", thought: true };
  const { callbacks } = lastConnectParams();
  callbacks.onmessage({
    setupComplete: {},
    serverContent: {
      modelTurn: { parts: [thoughtPart] },
    },
  });

  expect(onTranscript).not.toHaveBeenCalled();
});
it("forwards Live API tool calls and submits matching function responses", async () => {
  const onToolCall = vi.fn();
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
    onToolCall,
  });
  await bridge.connect();

  const { callbacks } = lastConnectParams();
  callbacks.onmessage({
    setupComplete: { sessionId: "session-1" },
    toolCall: {
      functionCalls: [{ id: "call-1", name: "lookup", args: { query: "hi" } }],
    },
  });

  // The Live API call id is expected as both itemId and callId on the forwarded event.
  const expectedToolCall = {
    itemId: "call-1",
    callId: "call-1",
    name: "lookup",
    args: { query: "hi" },
  };
  expect(onToolCall).toHaveBeenCalledWith(expectedToolCall);

  // Submitting by id alone must yield a response that also names the function.
  bridge.submitToolResult("call-1", { result: "ok" });
  const expectedResponse = {
    id: "call-1",
    name: "lookup",
    response: { result: "ok" },
  };
  expect(session.sendToolResponse).toHaveBeenCalledWith({
    functionResponses: [expectedResponse],
  });
});
it("keeps Google Live consult calls open after continuing tool responses", async () => {
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
    onToolCall: vi.fn(),
  });
  await bridge.connect();

  const { callbacks } = lastConnectParams();
  callbacks.onmessage({
    setupComplete: { sessionId: "session-1" },
    toolCall: {
      functionCalls: [
        { id: "consult-call", name: "openclaw_agent_consult", args: { prompt: "hi" } },
      ],
    },
  });

  // First an interim result flagged willContinue, then the final result for the same call id.
  bridge.submitToolResult(
    "consult-call",
    { status: "working", message: "Tell the participant you are checking." },
    { willContinue: true },
  );
  bridge.submitToolResult("consult-call", { text: "The meeting starts at 3." });

  const interimResponse = {
    id: "consult-call",
    name: "openclaw_agent_consult",
    scheduling: "WHEN_IDLE",
    willContinue: true,
    response: { status: "working", message: "Tell the participant you are checking." },
  };
  // The second response must still resolve the function name from the same call id.
  const finalResponse = {
    id: "consult-call",
    name: "openclaw_agent_consult",
    scheduling: "WHEN_IDLE",
    response: { text: "The meeting starts at 3." },
  };
  expect(session.sendToolResponse).toHaveBeenNthCalledWith(1, {
    functionResponses: [interimResponse],
  });
  expect(session.sendToolResponse).toHaveBeenNthCalledWith(2, {
    functionResponses: [finalResponse],
  });
});
it("does not send malformed Live API tool responses without a matching call name", async () => {
  const onError = vi.fn();
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
    onError,
  });
  await bridge.connect();

  // No tool call with this id was ever received from the session.
  bridge.submitToolResult("missing-call", { result: "ok" });

  // Nothing is sent; the problem is reported through onError instead.
  expect(session.sendToolResponse).not.toHaveBeenCalled();
  const expectedError = expect.objectContaining({
    message: "Google Live function response is missing a matching function call for missing-call",
  });
  expect(onError).toHaveBeenCalledWith(expectedError);
});
it("reports Google Live tool response send failures without losing the call name", async () => {
  const onError = vi.fn();
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
    onError,
  });
  await bridge.connect();

  const { callbacks } = lastConnectParams();
  callbacks.onmessage({
    setupComplete: { sessionId: "session-1" },
    toolCall: {
      functionCalls: [{ id: "call-1", name: "lookup", args: { query: "hi" } }],
    },
  });

  // Make only the next sendToolResponse call throw.
  const sendError = new Error("SDK send failed");
  session.sendToolResponse.mockImplementationOnce(() => {
    throw sendError;
  });

  // The failed send is surfaced through onError with the very same error object.
  bridge.submitToolResult("call-1", ["retryable"]);
  expect(onError).toHaveBeenCalledWith(sendError);

  // A retry with the same call id must still carry the function name.
  bridge.submitToolResult("call-1", { result: "ok" });
  const expectedResponse = {
    id: "call-1",
    name: "lookup",
    response: { result: "ok" },
  };
  expect(session.sendToolResponse).toHaveBeenLastCalledWith({
    functionResponses: [expectedResponse],
  });
});
});