mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 12:30:44 +00:00
feat(voice-call): improve realtime Meet voice agent
* feat(voice-call): inject agent context into realtime voice
* fix(voice-call): stabilize realtime meet audio
* fix(voice-call): delegate realtime consults to agent
* Improve realtime Meet voice consult routing
* Pin voice consult delivery to call session
* Move voice changelog entries to changes
* fix(voice-call): isolate final realtime transcripts
* test(voice-call): trim redundant realtime coverage
This commit is contained in:
@@ -3,13 +3,16 @@ import type {
|
||||
ProviderReplaySessionEntry,
|
||||
ProviderSanitizeReplayHistoryContext,
|
||||
} from "openclaw/plugin-sdk/plugin-entry";
|
||||
import { createTestPluginApi } from "openclaw/plugin-sdk/plugin-test-api";
|
||||
import {
|
||||
registerProviderPlugin,
|
||||
requireRegisteredProvider,
|
||||
} from "openclaw/plugin-sdk/plugin-test-runtime";
|
||||
import { createCapturedThinkingConfigStream } from "openclaw/plugin-sdk/provider-test-contracts";
|
||||
import type { RealtimeVoiceProviderPlugin } from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { registerGoogleGeminiCliProvider } from "./gemini-cli-provider.js";
|
||||
import googlePlugin from "./index.js";
|
||||
import { registerGoogleProvider } from "./provider-registration.js";
|
||||
|
||||
const googleProviderPlugin = {
|
||||
@@ -226,4 +229,26 @@ describe("google provider plugin hooks", () => {
|
||||
expect(googleProvider.buildReplayPolicy).toBe(cliProvider.buildReplayPolicy);
|
||||
expect(googleProvider.wrapStreamFn).toBe(cliProvider.wrapStreamFn);
|
||||
});
|
||||
|
||||
// Smoke test: the bridge handed out by the Google plugin must accept audio,
// media timestamps, and user messages before connect() has ever been called.
it("buffers early realtime audio while the lazy Google bridge loads", () => {
  let realtimeProvider: RealtimeVoiceProviderPlugin | undefined;
  // Capture the realtime voice provider that the plugin registers.
  const api = createTestPluginApi({
    registerRealtimeVoiceProvider(provider) {
      realtimeProvider = provider;
    },
  });
  googlePlugin.register(api);

  const bridge = realtimeProvider?.createBridge({
    providerConfig: { apiKey: "gemini-key" },
    onAudio() {},
    onClearAudio() {},
  });
  expect(bridge).toBeDefined();

  // connect() is intentionally never called, so every call below reaches the
  // bridge in its pre-connect state and must not throw.
  const earlyCalls: Array<() => void> = [
    () => bridge?.sendAudio(Buffer.alloc(160)),
    () => bridge?.setMediaTimestamp(20),
    () => bridge?.sendUserMessage?.("hello"),
  ];
  for (const earlyCall of earlyCalls) {
    expect(earlyCall).not.toThrow();
  }
});
|
||||
});
|
||||
|
||||
@@ -200,11 +200,18 @@ function resolveGoogleRealtimeEnvApiKey(): string | undefined {
|
||||
);
|
||||
}
|
||||
|
||||
// Upper bound on audio chunks queued by the lazy bridge before the real bridge
// exists; once the queue is full the oldest chunk is dropped for each new one.
const GOOGLE_REALTIME_LAZY_MAX_PENDING_AUDIO_CHUNKS = 320;
|
||||
|
||||
function createLazyGoogleRealtimeVoiceBridge(
|
||||
req: RealtimeVoiceBridgeCreateRequest,
|
||||
): RealtimeVoiceBridge {
|
||||
let bridge: RealtimeVoiceBridge | undefined;
|
||||
let bridgePromise: Promise<RealtimeVoiceBridge> | undefined;
|
||||
let closed = false;
|
||||
let latestMediaTimestamp: number | undefined;
|
||||
let pendingGreeting: string | undefined;
|
||||
const pendingAudio: Buffer[] = [];
|
||||
const pendingUserMessages: string[] = [];
|
||||
const loadBridge = async () => {
|
||||
if (!bridgePromise) {
|
||||
bridgePromise = loadGoogleRealtimeVoiceProvider().then((provider) =>
|
||||
@@ -220,20 +227,78 @@ function createLazyGoogleRealtimeVoiceBridge(
|
||||
}
|
||||
return bridge;
|
||||
};
|
||||
/**
 * Replays everything queued while the real bridge was unavailable, in a fixed
 * order: latest media timestamp, buffered audio, buffered user messages, and
 * finally the pending greeting (if any). Each queue is emptied as it is
 * replayed so a later flush cannot deliver the same item twice.
 */
const flushPending = (loadedBridge: RealtimeVoiceBridge) => {
  if (typeof latestMediaTimestamp === "number") {
    loadedBridge.setMediaTimestamp(latestMediaTimestamp);
  }

  // splice(0) empties the queue up front and hands back its former contents.
  const queuedAudio = pendingAudio.splice(0);
  queuedAudio.forEach((chunk) => {
    loadedBridge.sendAudio(chunk);
  });

  const queuedMessages = pendingUserMessages.splice(0);
  queuedMessages.forEach((message) => {
    loadedBridge.sendUserMessage?.(message);
  });

  const greeting = pendingGreeting;
  if (greeting !== undefined) {
    // Clear before dispatching so the greeting is delivered at most once.
    pendingGreeting = undefined;
    loadedBridge.triggerGreeting?.(greeting);
  }
};
|
||||
return {
|
||||
supportsToolResultContinuation: true,
|
||||
connect: async () => {
|
||||
await (await loadBridge()).connect();
|
||||
const loadedBridge = await loadBridge();
|
||||
if (closed) {
|
||||
loadedBridge.close();
|
||||
return;
|
||||
}
|
||||
await loadedBridge.connect();
|
||||
flushPending(loadedBridge);
|
||||
},
|
||||
sendAudio: (audio) => {
|
||||
if (bridge) {
|
||||
bridge.sendAudio(audio);
|
||||
return;
|
||||
}
|
||||
if (!closed) {
|
||||
if (pendingAudio.length >= GOOGLE_REALTIME_LAZY_MAX_PENDING_AUDIO_CHUNKS) {
|
||||
pendingAudio.shift();
|
||||
}
|
||||
pendingAudio.push(audio);
|
||||
}
|
||||
},
|
||||
setMediaTimestamp: (ts) => {
|
||||
latestMediaTimestamp = ts;
|
||||
bridge?.setMediaTimestamp(ts);
|
||||
},
|
||||
sendUserMessage: (text) => {
|
||||
if (bridge) {
|
||||
bridge.sendUserMessage?.(text);
|
||||
return;
|
||||
}
|
||||
if (!closed) {
|
||||
pendingUserMessages.push(text);
|
||||
}
|
||||
},
|
||||
triggerGreeting: (instructions) => {
|
||||
if (bridge) {
|
||||
bridge.triggerGreeting?.(instructions);
|
||||
return;
|
||||
}
|
||||
if (!closed) {
|
||||
pendingGreeting = instructions;
|
||||
}
|
||||
},
|
||||
sendAudio: (audio) => requireBridge().sendAudio(audio),
|
||||
setMediaTimestamp: (ts) => requireBridge().setMediaTimestamp(ts),
|
||||
sendUserMessage: (text) => requireBridge().sendUserMessage?.(text),
|
||||
triggerGreeting: (instructions) => requireBridge().triggerGreeting?.(instructions),
|
||||
handleBargeIn: (options) => requireBridge().handleBargeIn?.(options),
|
||||
submitToolResult: (callId, result, options) =>
|
||||
requireBridge().submitToolResult(callId, result, options),
|
||||
acknowledgeMark: () => requireBridge().acknowledgeMark(),
|
||||
close: () => bridge?.close(),
|
||||
close: () => {
|
||||
closed = true;
|
||||
pendingAudio.length = 0;
|
||||
pendingUserMessages.length = 0;
|
||||
pendingGreeting = undefined;
|
||||
bridge?.close();
|
||||
},
|
||||
isConnected: () => bridge?.isConnected() ?? false,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ type MockGoogleLiveConnectParams = {
|
||||
onopen: () => void;
|
||||
onmessage: (message: Record<string, unknown>) => void;
|
||||
onerror: (event: { error?: unknown; message?: string }) => void;
|
||||
onclose: () => void;
|
||||
onclose: (event?: { code?: number; reason?: string; wasClean?: boolean }) => void;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -352,6 +352,47 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
expect(lastConnectParams().config.sessionResumption).toEqual({ handle: "resume-1" });
|
||||
});
|
||||
|
||||
// An unclean close must not surface as onClose; instead the bridge reports a
// "reconnecting" error, waits for the backoff delay, and reconnects using the
// most recent session resumption handle.
it("reconnects unexpected Google Live closes with the latest resumption handle", async () => {
  vi.useFakeTimers();
  try {
    const provider = buildGoogleRealtimeVoiceProvider();
    const closeSpy = vi.fn();
    const errorSpy = vi.fn();
    const bridge = provider.createBridge({
      providerConfig: { apiKey: "gemini-key" },
      onAudio: vi.fn(),
      onClearAudio: vi.fn(),
      onClose: closeSpy,
      onError: errorSpy,
    });
    await bridge.connect();

    // Both callbacks below belong to the first (and so far only) connection.
    const { callbacks } = lastConnectParams();
    callbacks.onmessage({
      setupComplete: { sessionId: "session-1" },
      sessionResumptionUpdate: { resumable: true, newHandle: "resume-1" },
    });
    callbacks.onclose({
      code: 1011,
      reason: "temporary upstream close",
      wasClean: false,
    });

    expect(closeSpy).not.toHaveBeenCalled();
    const reconnectNotice = expect.objectContaining({
      message: expect.stringContaining("reconnecting 1/3"),
    });
    expect(errorSpy).toHaveBeenCalledWith(reconnectNotice);

    // Let the first backoff delay elapse so the reconnect attempt runs.
    await vi.advanceTimersByTimeAsync(250);

    expect(connectMock).toHaveBeenCalledTimes(2);
    expect(lastConnectParams().config.sessionResumption).toEqual({ handle: "resume-1" });
  } finally {
    vi.useRealTimers();
  }
});
|
||||
|
||||
it("waits for setup completion before draining audio and firing ready", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const onReady = vi.fn();
|
||||
|
||||
@@ -50,6 +50,9 @@ const MAX_PENDING_AUDIO_CHUNKS = 320;
|
||||
const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 500;
|
||||
const GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS = 30 * 60 * 1000;
|
||||
const GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS = 60 * 1000;
|
||||
// Number of reconnect attempts allowed before an unexpected close is treated
// as terminal.
const GOOGLE_REALTIME_RECONNECT_MAX_ATTEMPTS = 3;
|
||||
// Delay before the first reconnect attempt; doubled for each further attempt.
const GOOGLE_REALTIME_RECONNECT_BASE_DELAY_MS = 250;
|
||||
// Ceiling applied to the doubled reconnect delay.
const GOOGLE_REALTIME_RECONNECT_MAX_DELAY_MS = 2_000;
|
||||
const MULAW_LINEAR_SAMPLES = new Int16Array(256);
|
||||
|
||||
for (let i = 0; i < MULAW_LINEAR_SAMPLES.length; i += 1) {
|
||||
@@ -401,6 +404,24 @@ function isPcm16Silence(audio: Buffer): boolean {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Renders a Google Live close event as a one-line `key=value` summary suitable
 * for embedding in error messages.
 *
 * @param event Close event passed to the session's `onclose` callback, or
 *   `undefined` when the callback was invoked without one.
 * @returns `code=<n|unknown> reason=<text|none|unknown>` followed by
 *   ` clean=<bool>` when `wasClean` is a boolean. A missing event yields
 *   `code=unknown reason=unknown`.
 */
function formatGoogleLiveCloseEvent(
  event:
    | {
        code?: number;
        reason?: string;
        wasClean?: boolean;
      }
    | undefined,
): string {
  if (!event) {
    return "code=unknown reason=unknown";
  }
  // NaN / Infinity carry no diagnostic value; report them like a missing code.
  const code =
    typeof event.code === "number" && Number.isFinite(event.code) ? event.code : "unknown";
  // The reason is supplied by the remote peer. Guard against a non-string
  // payload at runtime and collapse newlines/runs of whitespace so the
  // summary always stays on a single line.
  const rawReason = typeof event.reason === "string" ? event.reason : "";
  const reason = rawReason.replace(/\s+/g, " ").trim() || "none";
  const clean = typeof event.wasClean === "boolean" ? ` clean=${event.wasClean}` : "";
  return `code=${code} reason=${reason}${clean}`;
}
|
||||
|
||||
class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
readonly supportsToolResultContinuation = true;
|
||||
|
||||
@@ -415,6 +436,8 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
private pendingFunctionNames = new Map<string, string>();
|
||||
private readonly audioFormat: RealtimeVoiceAudioFormat;
|
||||
private resumptionHandle: string | undefined;
|
||||
private reconnectAttempts = 0;
|
||||
private reconnectTimer: ReturnType<typeof setTimeout> | undefined;
|
||||
|
||||
constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {
|
||||
this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ;
|
||||
@@ -464,13 +487,23 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
);
|
||||
this.config.onError?.(error);
|
||||
},
|
||||
onclose: () => {
|
||||
onclose: (event) => {
|
||||
this.connected = false;
|
||||
this.sessionConfigured = false;
|
||||
this.pendingFunctionNames.clear();
|
||||
const reason = this.intentionallyClosed ? "completed" : "error";
|
||||
this.session = null;
|
||||
this.config.onClose?.(reason);
|
||||
if (this.intentionallyClosed) {
|
||||
this.config.onClose?.("completed");
|
||||
return;
|
||||
}
|
||||
const closeDetails = formatGoogleLiveCloseEvent(event);
|
||||
if (this.scheduleReconnect(closeDetails)) {
|
||||
return;
|
||||
}
|
||||
this.config.onError?.(
|
||||
new Error(`Google Live session closed after reconnect attempts: ${closeDetails}`),
|
||||
);
|
||||
this.config.onClose?.("error");
|
||||
},
|
||||
},
|
||||
})) as GoogleLiveSession;
|
||||
@@ -596,6 +629,10 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
this.intentionallyClosed = true;
|
||||
this.connected = false;
|
||||
this.sessionConfigured = false;
|
||||
if (this.reconnectTimer) {
|
||||
clearTimeout(this.reconnectTimer);
|
||||
this.reconnectTimer = undefined;
|
||||
}
|
||||
this.pendingAudio = [];
|
||||
this.consecutiveSilenceMs = 0;
|
||||
this.audioStreamEnded = false;
|
||||
@@ -667,6 +704,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
|
||||
private handleSetupComplete(): void {
|
||||
this.sessionConfigured = true;
|
||||
this.reconnectAttempts = 0;
|
||||
for (const chunk of this.pendingAudio.splice(0)) {
|
||||
this.sendAudio(chunk);
|
||||
}
|
||||
@@ -739,6 +777,36 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Arms a delayed reconnect after an unexpected close or a failed reconnect.
 *
 * Each call consumes one of the `GOOGLE_REALTIME_RECONNECT_MAX_ATTEMPTS`
 * attempts. The delay starts at `GOOGLE_REALTIME_RECONNECT_BASE_DELAY_MS`,
 * doubles per attempt, and is capped at
 * `GOOGLE_REALTIME_RECONNECT_MAX_DELAY_MS`. The pending attempt is announced
 * through `config.onError`.
 *
 * @param closeDetails Description of why the session went away; embedded in
 *   the error reported to `config.onError`.
 * @returns `true` when a reconnect timer was armed, `false` when the attempt
 *   budget is exhausted (the caller is then responsible for signalling the
 *   terminal close).
 */
private scheduleReconnect(closeDetails: string): boolean {
  if (this.reconnectAttempts >= GOOGLE_REALTIME_RECONNECT_MAX_ATTEMPTS) {
    return false;
  }
  const attempt = ++this.reconnectAttempts;
  const delayMs = Math.min(
    GOOGLE_REALTIME_RECONNECT_MAX_DELAY_MS,
    GOOGLE_REALTIME_RECONNECT_BASE_DELAY_MS * 2 ** (attempt - 1),
  );
  this.config.onError?.(
    new Error(
      `Google Live session closed unexpectedly (${closeDetails}); reconnecting ${attempt}/${GOOGLE_REALTIME_RECONNECT_MAX_ATTEMPTS} in ${delayMs}ms`,
    ),
  );
  // Keep at most one timer outstanding: overwriting the handle without
  // clearing it would orphan the earlier timer, which could then no longer be
  // cancelled through `this.reconnectTimer`.
  if (this.reconnectTimer) {
    clearTimeout(this.reconnectTimer);
  }
  this.reconnectTimer = setTimeout(() => {
    this.reconnectTimer = undefined;
    if (this.intentionallyClosed) {
      return;
    }
    this.connect().catch((error: unknown) => {
      // The bridge may have been closed on purpose while connect() was in
      // flight; in that case do not report errors, re-arm, or emit a
      // spurious "error" close.
      if (this.intentionallyClosed) {
        return;
      }
      const message = error instanceof Error ? error.message : String(error);
      this.config.onError?.(error instanceof Error ? error : new Error(message));
      if (!this.scheduleReconnect(`connect failed: ${message}`)) {
        this.config.onClose?.("error");
      }
    });
  }, delayMs);
  return true;
}
|
||||
}
|
||||
|
||||
function convertMulaw8kToPcm16k(muLaw: Buffer): Buffer {
|
||||
|
||||
Reference in New Issue
Block a user