mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
Fix Google Meet realtime interruption playback (#72524)
Fixes #72523.
Remote proof:
- CI run 24980529154 passed on 29f825bea5.
- Blacksmith Testbox tbx_01kq6tsgbaxgstxmtearwy9n4w passed focused formatting, Google Meet tests, Google realtime provider tests, and extension test typecheck.
Thanks @BsnizND.
Co-authored-by: BSnizND <199837910+BsnizND@users.noreply.github.com>
This commit is contained in:
@@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Web search: route plugin-scoped web_search SecretRefs through the active runtime config snapshot so provider execution receives resolved credentials across app/runtime paths, including `plugins.entries.brave.config.webSearch.apiKey`. Fixes #68690. Thanks @VACInc.
|
||||
- Voice Call: allow SecretRef-backed Twilio auth tokens and call-specific OpenAI/ElevenLabs TTS API keys through the plugin config surface. Fixes #68690. Thanks @joshavant.
|
||||
- Google Meet: clean stale chrome-node realtime audio bridges by URL before rejoining, expose active node bridge inspection, and tolerate transient node input pull failures instead of dropping the Meet session. Fixes #72371. (#72372) Thanks @BsnizND.
|
||||
- Google Meet: clear queued Gemini Live playback when realtime interruptions arrive, restart Chrome command-pair audio output after clears, and expose Google Live interruption/VAD config knobs for Meet and Voice Call realtime bridges. Fixes #72523. (#72524) Thanks @BsnizND.
|
||||
- Matrix/E2EE: stabilize recovery and broken-device QA flows while avoiding Matrix device-cleanup sync races that could leave shutdown-time crypto work running. Thanks @gumadeiras.
|
||||
- Cron: treat isolated run-level agent failures as job errors even when no reply payload is produced, synthesizing a safe error payload so model/provider failures increment error counters and trigger failure notifications instead of clearing as successful. Fixes #43604; carries forward #43631. Thanks @SPFAdvisors.
|
||||
- Cron: preserve exact `NO_REPLY` tool results from isolated jobs with empty final assistant turns as quiet successes instead of surfacing incomplete-turn errors. Fixes #68452; carries forward #68453. Thanks @anyech.
|
||||
|
||||
@@ -308,6 +308,9 @@ Gemini Live API for backend audio bridges such as Voice Call and Google Meet.
|
||||
| VAD start sensitivity | `...google.startSensitivity` | (unset) |
|
||||
| VAD end sensitivity | `...google.endSensitivity` | (unset) |
|
||||
| Silence duration | `...google.silenceDurationMs` | (unset) |
|
||||
| Activity handling | `...google.activityHandling` | Google default, `start-of-activity-interrupts` |
|
||||
| Turn coverage | `...google.turnCoverage` | Google default, `only-activity` |
|
||||
| Disable auto VAD | `...google.automaticActivityDetectionDisabled` | `false` |
|
||||
| API key | `...google.apiKey` | Falls back to `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY` |
|
||||
|
||||
Example Voice Call realtime config:
|
||||
@@ -326,6 +329,8 @@ Example Voice Call realtime config:
|
||||
google: {
|
||||
model: "gemini-2.5-flash-native-audio-preview-12-2025",
|
||||
voice: "Kore",
|
||||
activityHandling: "start-of-activity-interrupts",
|
||||
turnCoverage: "only-activity",
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
@@ -217,6 +217,7 @@ type TestBridgeProcess = {
|
||||
killed: boolean;
|
||||
kill: ReturnType<typeof vi.fn>;
|
||||
on: EventEmitter["on"];
|
||||
emit: EventEmitter["emit"];
|
||||
};
|
||||
|
||||
describe("google-meet plugin", () => {
|
||||
@@ -1881,6 +1882,7 @@ describe("google-meet plugin", () => {
|
||||
let callbacks:
|
||||
| {
|
||||
onAudio: (audio: Buffer) => void;
|
||||
onClearAudio: () => void;
|
||||
onMark?: (markName: string) => void;
|
||||
onToolCall?: (event: {
|
||||
itemId: string;
|
||||
@@ -1916,6 +1918,7 @@ describe("google-meet plugin", () => {
|
||||
};
|
||||
const inputStdout = new PassThrough();
|
||||
const outputStdinWrites: Buffer[] = [];
|
||||
const replacementOutputStdinWrites: Buffer[] = [];
|
||||
const makeProcess = (stdio: {
|
||||
stdin?: { write(chunk: unknown): unknown } | null;
|
||||
stdout?: { on(event: "data", listener: (chunk: unknown) => void): unknown } | null;
|
||||
@@ -1937,9 +1940,20 @@ describe("google-meet plugin", () => {
|
||||
done();
|
||||
},
|
||||
});
|
||||
const replacementOutputStdin = new Writable({
|
||||
write(chunk, _encoding, done) {
|
||||
replacementOutputStdinWrites.push(Buffer.from(chunk));
|
||||
done();
|
||||
},
|
||||
});
|
||||
const inputProcess = makeProcess({ stdout: inputStdout, stdin: null });
|
||||
const outputProcess = makeProcess({ stdin: outputStdin, stdout: null });
|
||||
const spawnMock = vi.fn().mockReturnValueOnce(outputProcess).mockReturnValueOnce(inputProcess);
|
||||
const replacementOutputProcess = makeProcess({ stdin: replacementOutputStdin, stdout: null });
|
||||
const spawnMock = vi
|
||||
.fn()
|
||||
.mockReturnValueOnce(outputProcess)
|
||||
.mockReturnValueOnce(inputProcess)
|
||||
.mockReturnValueOnce(replacementOutputProcess);
|
||||
const sessionStore: Record<string, unknown> = {};
|
||||
const runtime = {
|
||||
agent: {
|
||||
@@ -1977,6 +1991,8 @@ describe("google-meet plugin", () => {
|
||||
inputStdout.write(Buffer.from([1, 2, 3]));
|
||||
callbacks?.onAudio(Buffer.from([4, 5]));
|
||||
callbacks?.onMark?.("mark-1");
|
||||
callbacks?.onClearAudio();
|
||||
callbacks?.onAudio(Buffer.from([6, 7]));
|
||||
callbacks?.onReady?.();
|
||||
callbacks?.onToolCall?.({
|
||||
itemId: "item-1",
|
||||
@@ -1993,6 +2009,10 @@ describe("google-meet plugin", () => {
|
||||
});
|
||||
expect(sendAudio).toHaveBeenCalledWith(Buffer.from([1, 2, 3]));
|
||||
expect(outputStdinWrites).toEqual([Buffer.from([4, 5])]);
|
||||
expect(outputProcess.kill).toHaveBeenCalledWith("SIGTERM");
|
||||
expect(replacementOutputStdinWrites).toEqual([Buffer.from([6, 7])]);
|
||||
outputProcess.emit("error", new Error("stale output process failed after clear"));
|
||||
expect(bridge.close).not.toHaveBeenCalled();
|
||||
expect(bridge.acknowledgeMark).toHaveBeenCalled();
|
||||
expect(bridge.triggerGreeting).not.toHaveBeenCalled();
|
||||
handle.speak("Say exactly: hello from the meeting.");
|
||||
@@ -2003,7 +2023,8 @@ describe("google-meet plugin", () => {
|
||||
audioInputActive: true,
|
||||
audioOutputActive: true,
|
||||
lastInputBytes: 3,
|
||||
lastOutputBytes: 2,
|
||||
lastOutputBytes: 4,
|
||||
clearCount: 1,
|
||||
});
|
||||
expect(callbacks).toMatchObject({
|
||||
tools: [
|
||||
@@ -2035,6 +2056,7 @@ describe("google-meet plugin", () => {
|
||||
let callbacks:
|
||||
| {
|
||||
onAudio: (audio: Buffer) => void;
|
||||
onClearAudio: () => void;
|
||||
onToolCall?: (event: {
|
||||
itemId: string;
|
||||
callId: string;
|
||||
@@ -2114,6 +2136,7 @@ describe("google-meet plugin", () => {
|
||||
});
|
||||
|
||||
callbacks?.onAudio(Buffer.from([1, 2, 3]));
|
||||
callbacks?.onClearAudio();
|
||||
callbacks?.onReady?.();
|
||||
callbacks?.onToolCall?.({
|
||||
itemId: "item-1",
|
||||
@@ -2138,6 +2161,19 @@ describe("google-meet plugin", () => {
|
||||
}),
|
||||
);
|
||||
});
|
||||
await vi.waitFor(() => {
|
||||
expect(runtime.nodes.invoke).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
nodeId: "node-1",
|
||||
command: "googlemeet.chrome",
|
||||
params: {
|
||||
action: "clearAudio",
|
||||
bridgeId: "bridge-1",
|
||||
},
|
||||
timeoutMs: 5_000,
|
||||
}),
|
||||
);
|
||||
});
|
||||
await vi.waitFor(() => {
|
||||
expect(bridge.submitToolResult).toHaveBeenCalledWith("tool-call-1", {
|
||||
text: "Use the launch update.",
|
||||
@@ -2166,6 +2202,7 @@ describe("google-meet plugin", () => {
|
||||
audioOutputActive: true,
|
||||
lastInputBytes: 3,
|
||||
lastOutputBytes: 3,
|
||||
clearCount: 1,
|
||||
});
|
||||
|
||||
await handle.stop();
|
||||
|
||||
@@ -40,6 +40,83 @@ vi.mock("node:child_process", async (importOriginal) => {
|
||||
});
|
||||
|
||||
describe("google-meet node host bridge sessions", () => {
|
||||
// Regression test: clearing playback swaps in a fresh output child, and late
// "error"/"exit" events from the replaced child must not close the bridge.
it("clears output playback without closing the active bridge when the old output exits", async () => {
  const { handleGoogleMeetNodeHostCommand } = await import("./src/node-host.js");
  const originalPlatform = process.platform;
  children.length = 0;

  // Serializes a command payload the same way every call in this test does.
  const send = (payload: Record<string, unknown>) =>
    handleGoogleMeetNodeHostCommand(JSON.stringify(payload));

  // NOTE(review): presumably the node host only handles these commands on macOS — confirm.
  Object.defineProperty(process, "platform", { configurable: true, value: "darwin" });
  try {
    const start = JSON.parse(
      await send({
        action: "start",
        url: "https://meet.google.com/xyz-abcd-uvw",
        mode: "realtime",
        launch: false,
        audioInputCommand: ["mock-rec"],
        audioOutputCommand: ["mock-play"],
      }),
    );

    // Starting the bridge spawns two children; the first one is the output command.
    expect(children).toHaveLength(2);
    const firstOutput = children[0];
    const bridgeId = start.bridgeId;

    const cleared = JSON.parse(await send({ action: "clearAudio", bridgeId }));

    expect(cleared).toEqual({ bridgeId, ok: true, clearCount: 1 });
    // A third child (the replacement output) was spawned and the old one was signalled.
    expect(children).toHaveLength(3);
    expect(firstOutput?.kill).toHaveBeenCalledWith("SIGTERM");

    // Late events from the replaced output process.
    firstOutput?.emit("error", new Error("stale output failed after clear"));
    firstOutput?.emit("exit", 0, "SIGTERM");

    const status = JSON.parse(await send({ action: "status", bridgeId }));

    expect(status.bridge).toMatchObject({
      bridgeId,
      closed: false,
      clearCount: 1,
    });

    const audio = Buffer.from([1, 2, 3]);
    await send({
      action: "pushAudio",
      bridgeId,
      base64: audio.toString("base64"),
    });

    // New audio goes to the replacement output only.
    expect(children[2]?.stdin?.write).toHaveBeenCalledWith(audio);
    expect(firstOutput?.stdin?.write).not.toHaveBeenCalled();

    await send({ action: "stop", bridgeId });
  } finally {
    Object.defineProperty(process, "platform", { configurable: true, value: originalPlatform });
  }
});
|
||||
|
||||
it("lists active bridge sessions and hides closed sessions", async () => {
|
||||
const { handleGoogleMeetNodeHostCommand } = await import("./src/node-host.js");
|
||||
const originalPlatform = process.platform;
|
||||
|
||||
@@ -15,6 +15,7 @@ type NodeBridgeSession = {
|
||||
id: string;
|
||||
url?: string;
|
||||
mode?: string;
|
||||
outputCommand: { command: string; args: string[] };
|
||||
input?: ChildProcess;
|
||||
output?: ChildProcess;
|
||||
chunks: Buffer[];
|
||||
@@ -23,9 +24,11 @@ type NodeBridgeSession = {
|
||||
createdAt: string;
|
||||
lastInputAt?: string;
|
||||
lastOutputAt?: string;
|
||||
lastClearAt?: string;
|
||||
lastInputBytes: number;
|
||||
lastOutputBytes: number;
|
||||
closedAt?: string;
|
||||
clearCount: number;
|
||||
};
|
||||
|
||||
const sessions = new Map<string, NodeBridgeSession>();
|
||||
@@ -110,6 +113,25 @@ function stopSession(session: NodeBridgeSession) {
|
||||
wake(session);
|
||||
}
|
||||
|
||||
/**
 * Stops the session when its current output process exits or errors.
 *
 * The handlers compare against `session.output` at event time, so events from
 * an output process that has already been replaced are ignored.
 *
 * @param session Bridge session that owns the output process.
 * @param outputProcess Child process to watch.
 */
function attachOutputProcessHandlers(session: NodeBridgeSession, outputProcess: ChildProcess) {
  const stopIfStillCurrent = () => {
    if (session.output !== outputProcess) {
      // A newer output process has taken over; this one is stale.
      return;
    }
    stopSession(session);
  };
  outputProcess.on("exit", stopIfStillCurrent);
  outputProcess.on("error", stopIfStillCurrent);
}
|
||||
|
||||
/**
 * Spawns the audio output command with a writable stdin, stdout discarded,
 * and stderr piped.
 *
 * @param command Executable and argument list to launch.
 * @returns The spawned child process.
 */
function startOutputProcess(command: { command: string; args: string[] }) {
  const { command: executable, args } = command;
  return spawn(executable, args, { stdio: ["pipe", "ignore", "pipe"] });
}
|
||||
|
||||
function startCommandPair(params: {
|
||||
inputCommand: string[];
|
||||
outputCommand: string[];
|
||||
@@ -122,16 +144,16 @@ function startCommandPair(params: {
|
||||
id: `meet_node_${randomUUID()}`,
|
||||
url: params.url,
|
||||
mode: params.mode,
|
||||
outputCommand: output,
|
||||
chunks: [],
|
||||
waiters: [],
|
||||
closed: false,
|
||||
createdAt: new Date().toISOString(),
|
||||
lastInputBytes: 0,
|
||||
lastOutputBytes: 0,
|
||||
clearCount: 0,
|
||||
};
|
||||
const outputProcess = spawn(output.command, output.args, {
|
||||
stdio: ["pipe", "ignore", "pipe"],
|
||||
});
|
||||
const outputProcess = startOutputProcess(output);
|
||||
const inputProcess = spawn(input.command, input.args, {
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
@@ -148,9 +170,8 @@ function startCommandPair(params: {
|
||||
wake(session);
|
||||
});
|
||||
inputProcess.on("exit", () => stopSession(session));
|
||||
outputProcess.on("exit", () => stopSession(session));
|
||||
attachOutputProcessHandlers(session, outputProcess);
|
||||
inputProcess.on("error", () => stopSession(session));
|
||||
outputProcess.on("error", () => stopSession(session));
|
||||
sessions.set(session.id, session);
|
||||
return session;
|
||||
}
|
||||
@@ -224,6 +245,25 @@ function pushAudio(params: Record<string, unknown>) {
|
||||
return { bridgeId, ok: true };
|
||||
}
|
||||
|
||||
/**
 * Handles the `clearAudio` action by restarting the bridge's output command.
 *
 * @param params Command parameters; `bridgeId` identifies the session.
 * @returns `{ bridgeId, ok: true, clearCount }` with the updated clear counter.
 * @throws Error when `bridgeId` is missing or the bridge is unknown or closed.
 */
function clearAudio(params: Record<string, unknown>) {
  const bridgeId = readString(params.bridgeId);
  if (!bridgeId) {
    throw new Error("bridgeId required");
  }
  const session = sessions.get(bridgeId);
  if (session === undefined || session.closed) {
    throw new Error(`bridge is not open: ${bridgeId}`);
  }

  // Install the replacement before terminating the old process: the output
  // handlers only stop the session for the process currently in `session.output`.
  const staleOutput = session.output;
  const freshOutput = startOutputProcess(session.outputCommand);
  session.output = freshOutput;
  attachOutputProcessHandlers(session, freshOutput);

  session.lastClearAt = new Date().toISOString();
  session.clearCount += 1;

  terminateChild(staleOutput);
  return { bridgeId, ok: true, clearCount: session.clearCount };
}
|
||||
|
||||
function startChrome(params: Record<string, unknown>) {
|
||||
const url = readString(params.url);
|
||||
if (!url) {
|
||||
@@ -317,8 +357,11 @@ function bridgeStatus(params: Record<string, unknown>) {
|
||||
createdAt: session.createdAt,
|
||||
lastInputAt: session.lastInputAt,
|
||||
lastOutputAt: session.lastOutputAt,
|
||||
lastClearAt: session.lastClearAt,
|
||||
lastInputBytes: session.lastInputBytes,
|
||||
lastOutputBytes: session.lastOutputBytes,
|
||||
clearCount: session.clearCount,
|
||||
queuedInputChunks: session.chunks.length,
|
||||
}
|
||||
: bridgeId
|
||||
? { bridgeId, closed: true }
|
||||
@@ -438,6 +481,9 @@ export async function handleGoogleMeetNodeHostCommand(paramsJSON?: string | null
|
||||
case "pushAudio":
|
||||
result = pushAudio(params);
|
||||
break;
|
||||
case "clearAudio":
|
||||
result = clearAudio(params);
|
||||
break;
|
||||
case "stop":
|
||||
result = stopChrome(params);
|
||||
break;
|
||||
|
||||
@@ -50,10 +50,12 @@ export async function startNodeRealtimeAudioBridge(params: {
|
||||
let realtimeReady = false;
|
||||
let lastInputAt: string | undefined;
|
||||
let lastOutputAt: string | undefined;
|
||||
let lastClearAt: string | undefined;
|
||||
let lastInputBytes = 0;
|
||||
let lastOutputBytes = 0;
|
||||
let consecutiveInputErrors = 0;
|
||||
let lastInputError: string | undefined;
|
||||
let clearCount = 0;
|
||||
const resolved = resolveGoogleMeetRealtimeProvider({
|
||||
config: params.config,
|
||||
fullConfig: params.fullConfig,
|
||||
@@ -118,6 +120,26 @@ export async function startNodeRealtimeAudioBridge(params: {
|
||||
void stop();
|
||||
});
|
||||
},
|
||||
// Records the clear locally, then asks the node to clear playback for this
// bridge. The request is fire-and-forget; a failure is logged and stops the bridge.
clearAudio: () => {
  clearCount += 1;
  lastClearAt = new Date().toISOString();
  const clearRequest = params.runtime.nodes.invoke({
    nodeId: params.nodeId,
    command: "googlemeet.chrome",
    params: {
      action: "clearAudio",
      bridgeId: params.bridgeId,
    },
    timeoutMs: 5_000,
  });
  void clearRequest.catch((error) => {
    const reason = formatErrorMessage(error);
    params.logger.warn(`[google-meet] node audio clear failed: ${reason}`);
    void stop();
  });
},
|
||||
},
|
||||
onTranscript: (role, text, isFinal) => {
|
||||
if (isFinal) {
|
||||
@@ -230,10 +252,12 @@ export async function startNodeRealtimeAudioBridge(params: {
|
||||
audioOutputActive: lastOutputBytes > 0,
|
||||
lastInputAt,
|
||||
lastOutputAt,
|
||||
lastClearAt,
|
||||
lastInputBytes,
|
||||
lastOutputBytes,
|
||||
consecutiveInputErrors,
|
||||
lastInputError,
|
||||
clearCount,
|
||||
bridgeClosed: stopped,
|
||||
}),
|
||||
stop,
|
||||
|
||||
@@ -91,9 +91,11 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
const spawnFn: SpawnFn =
|
||||
params.spawn ??
|
||||
((command, args, options) => spawn(command, args, options) as unknown as BridgeProcess);
|
||||
const outputProcess = spawnFn(output.command, output.args, {
|
||||
stdio: ["pipe", "ignore", "pipe"],
|
||||
});
|
||||
const spawnOutputProcess = () =>
|
||||
spawnFn(output.command, output.args, {
|
||||
stdio: ["pipe", "ignore", "pipe"],
|
||||
});
|
||||
let outputProcess = spawnOutputProcess();
|
||||
const inputProcess = spawnFn(input.command, input.args, {
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
@@ -104,6 +106,8 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
let lastOutputAt: string | undefined;
|
||||
let lastInputBytes = 0;
|
||||
let lastOutputBytes = 0;
|
||||
let lastClearAt: string | undefined;
|
||||
let clearCount = 0;
|
||||
|
||||
const stop = async () => {
|
||||
if (stopped) {
|
||||
@@ -125,26 +129,53 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
params.logger.warn(`[google-meet] ${label} failed: ${formatErrorMessage(error)}`);
|
||||
void stop();
|
||||
};
|
||||
// Wires error/exit/stderr listeners onto an output process. Error and exit
// events only take effect while `proc` is still the active `outputProcess`,
// so a process replaced by a playback clear cannot stop the bridge.
const attachOutputProcessHandlers = (proc: BridgeProcess) => {
  const isActiveOutput = () => proc === outputProcess;

  proc.on("error", (error) => {
    if (isActiveOutput()) {
      fail("audio output command")(error);
    }
  });

  proc.on("exit", (code, signal) => {
    if (!isActiveOutput() || stopped) {
      return;
    }
    params.logger.warn(
      `[google-meet] audio output command exited (${code ?? signal ?? "done"})`,
    );
    void stop();
  });

  // stderr is forwarded to the debug log regardless of whether proc is still active.
  proc.stderr?.on("data", (chunk) => {
    params.logger.debug?.(`[google-meet] audio output: ${String(chunk).trim()}`);
  });
};
|
||||
// Drops queued playback by spawning a new output command and sending SIGTERM
// to the old one. Does nothing once the bridge has stopped.
const clearOutputPlayback = () => {
  if (!stopped) {
    const staleOutput = outputProcess;
    // Swap first so the stale process no longer counts as the active output
    // when its exit/error events arrive.
    outputProcess = spawnOutputProcess();
    attachOutputProcessHandlers(outputProcess);
    lastClearAt = new Date().toISOString();
    clearCount += 1;
    params.logger.debug?.(
      `[google-meet] cleared realtime audio output buffer by restarting playback command`,
    );
    staleOutput.kill("SIGTERM");
  }
};
|
||||
inputProcess.on("error", fail("audio input command"));
|
||||
outputProcess.on("error", fail("audio output command"));
|
||||
inputProcess.on("exit", (code, signal) => {
|
||||
if (!stopped) {
|
||||
params.logger.warn(`[google-meet] audio input command exited (${code ?? signal ?? "done"})`);
|
||||
void stop();
|
||||
}
|
||||
});
|
||||
outputProcess.on("exit", (code, signal) => {
|
||||
if (!stopped) {
|
||||
params.logger.warn(`[google-meet] audio output command exited (${code ?? signal ?? "done"})`);
|
||||
void stop();
|
||||
}
|
||||
});
|
||||
attachOutputProcessHandlers(outputProcess);
|
||||
inputProcess.stderr?.on("data", (chunk) => {
|
||||
params.logger.debug?.(`[google-meet] audio input: ${String(chunk).trim()}`);
|
||||
});
|
||||
outputProcess.stderr?.on("data", (chunk) => {
|
||||
params.logger.debug?.(`[google-meet] audio output: ${String(chunk).trim()}`);
|
||||
});
|
||||
|
||||
const resolved = resolveGoogleMeetRealtimeProvider({
|
||||
config: params.config,
|
||||
@@ -167,6 +198,7 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
lastOutputBytes += muLaw.byteLength;
|
||||
outputProcess.stdin?.write(muLaw);
|
||||
},
|
||||
clearAudio: clearOutputPlayback,
|
||||
},
|
||||
onTranscript: (role, text, isFinal) => {
|
||||
if (isFinal) {
|
||||
@@ -240,6 +272,8 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
lastOutputAt,
|
||||
lastInputBytes,
|
||||
lastOutputBytes,
|
||||
lastClearAt,
|
||||
clearCount,
|
||||
bridgeClosed: stopped,
|
||||
}),
|
||||
stop,
|
||||
|
||||
@@ -31,10 +31,13 @@ export type GoogleMeetChromeHealth = {
|
||||
audioOutputActive?: boolean;
|
||||
lastInputAt?: string;
|
||||
lastOutputAt?: string;
|
||||
lastClearAt?: string;
|
||||
lastInputBytes?: number;
|
||||
lastOutputBytes?: number;
|
||||
consecutiveInputErrors?: number;
|
||||
lastInputError?: string;
|
||||
clearCount?: number;
|
||||
queuedInputChunks?: number;
|
||||
browserUrl?: string;
|
||||
browserTitle?: string;
|
||||
bridgeClosed?: boolean;
|
||||
|
||||
@@ -77,6 +77,9 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
temperature: 0.4,
|
||||
silenceDurationMs: 700,
|
||||
startSensitivity: "high",
|
||||
activityHandling: "no_interruption",
|
||||
turnCoverage: "turn_includes_only_activity",
|
||||
automaticActivityDetectionDisabled: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -92,6 +95,9 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
silenceDurationMs: 700,
|
||||
startSensitivity: "high",
|
||||
endSensitivity: undefined,
|
||||
activityHandling: "no-interruption",
|
||||
turnCoverage: "only-activity",
|
||||
automaticActivityDetectionDisabled: false,
|
||||
enableAffectiveDialog: undefined,
|
||||
thinkingLevel: undefined,
|
||||
thinkingBudget: undefined,
|
||||
@@ -107,6 +113,9 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
voice: "Kore",
|
||||
temperature: 0.3,
|
||||
startSensitivity: "low",
|
||||
endSensitivity: "low",
|
||||
activityHandling: "no-interruption",
|
||||
turnCoverage: "only-activity",
|
||||
},
|
||||
instructions: "Speak briefly.",
|
||||
tools: [
|
||||
@@ -144,6 +153,14 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
},
|
||||
},
|
||||
outputAudioTranscription: {},
|
||||
realtimeInputConfig: {
|
||||
activityHandling: "NO_INTERRUPTION",
|
||||
automaticActivityDetection: {
|
||||
startOfSpeechSensitivity: "START_SENSITIVITY_LOW",
|
||||
endOfSpeechSensitivity: "END_SENSITIVITY_LOW",
|
||||
},
|
||||
turnCoverage: "TURN_INCLUDES_ONLY_ACTIVITY",
|
||||
},
|
||||
tools: [
|
||||
{
|
||||
functionDeclarations: [
|
||||
@@ -240,6 +257,28 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
|
||||
expect(session.sendRealtimeInput).toHaveBeenCalledWith({ audioStreamEnd: true });
|
||||
});
|
||||
|
||||
// Setting `automaticActivityDetectionDisabled: true` in the provider config must
// reach the connect call as `automaticActivityDetection.disabled`.
it("can disable automatic VAD for manual activity signaling experiments", async () => {
  const bridge = buildGoogleRealtimeVoiceProvider().createBridge({
    providerConfig: {
      apiKey: "gemini-key",
      automaticActivityDetectionDisabled: true,
    },
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });

  await bridge.connect();

  const expectedInputConfig = {
    automaticActivityDetection: { disabled: true },
  };
  expect(lastConnectParams().config).toMatchObject({
    realtimeInputConfig: expectedInputConfig,
  });
});
|
||||
|
||||
it("sends text prompts as ordered client turns", async () => {
|
||||
const provider = buildGoogleRealtimeVoiceProvider();
|
||||
const bridge = provider.createBridge({
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import {
|
||||
ActivityHandling,
|
||||
EndSensitivity,
|
||||
Modality,
|
||||
StartSensitivity,
|
||||
TurnCoverage,
|
||||
type FunctionDeclaration,
|
||||
type FunctionResponse,
|
||||
type LiveServerContent,
|
||||
@@ -34,6 +36,8 @@ const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
|
||||
|
||||
type GoogleRealtimeSensitivity = "low" | "high";
|
||||
type GoogleRealtimeThinkingLevel = "minimal" | "low" | "medium" | "high";
|
||||
/** Normalized `activityHandling` config values; `mapActivityHandling` translates them to the SDK `ActivityHandling` enum. */
type GoogleRealtimeActivityHandling = "start-of-activity-interrupts" | "no-interruption";
|
||||
/** Normalized `turnCoverage` config values; `mapTurnCoverage` translates them to the SDK `TurnCoverage` enum. */
type GoogleRealtimeTurnCoverage = "only-activity" | "all-input" | "audio-activity-and-all-video";
|
||||
|
||||
type GoogleRealtimeVoiceProviderConfig = {
|
||||
apiKey?: string;
|
||||
@@ -45,6 +49,9 @@ type GoogleRealtimeVoiceProviderConfig = {
|
||||
silenceDurationMs?: number;
|
||||
startSensitivity?: GoogleRealtimeSensitivity;
|
||||
endSensitivity?: GoogleRealtimeSensitivity;
|
||||
activityHandling?: GoogleRealtimeActivityHandling;
|
||||
turnCoverage?: GoogleRealtimeTurnCoverage;
|
||||
automaticActivityDetectionDisabled?: boolean;
|
||||
enableAffectiveDialog?: boolean;
|
||||
thinkingLevel?: GoogleRealtimeThinkingLevel;
|
||||
thinkingBudget?: number;
|
||||
@@ -60,6 +67,9 @@ type GoogleRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
|
||||
silenceDurationMs?: number;
|
||||
startSensitivity?: GoogleRealtimeSensitivity;
|
||||
endSensitivity?: GoogleRealtimeSensitivity;
|
||||
activityHandling?: GoogleRealtimeActivityHandling;
|
||||
turnCoverage?: GoogleRealtimeTurnCoverage;
|
||||
automaticActivityDetectionDisabled?: boolean;
|
||||
enableAffectiveDialog?: boolean;
|
||||
thinkingLevel?: GoogleRealtimeThinkingLevel;
|
||||
thinkingBudget?: number;
|
||||
@@ -105,6 +115,40 @@ function asThinkingLevel(value: unknown): GoogleRealtimeThinkingLevel | undefine
|
||||
: undefined;
|
||||
}
|
||||
|
||||
/**
 * Parses a raw config value into a normalized activity-handling mode.
 *
 * Matching is case-insensitive and treats `_` as `-`, so SDK-style spellings
 * such as `NO_INTERRUPTION` are accepted alongside the short aliases.
 *
 * @param value Raw config value of any type.
 * @returns The normalized mode, or `undefined` when the value is not recognized.
 */
function asActivityHandling(value: unknown): GoogleRealtimeActivityHandling | undefined {
  const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-");
  if (normalized === undefined) {
    return undefined;
  }
  const interruptAliases = [
    "start-of-activity-interrupts",
    "start-of-activity-interrupt",
    "interrupt",
    "interrupts",
  ];
  if (interruptAliases.includes(normalized)) {
    return "start-of-activity-interrupts";
  }
  const noInterruptionAliases = ["no-interruption", "no-interruptions", "none"];
  if (noInterruptionAliases.includes(normalized)) {
    return "no-interruption";
  }
  return undefined;
}
|
||||
|
||||
/**
 * Parses a raw config value into a normalized turn-coverage mode.
 *
 * Matching is case-insensitive and treats `_` as `-`; each mode is accepted
 * both bare and with a `turn-includes-` prefix.
 *
 * @param value Raw config value of any type.
 * @returns The normalized mode, or `undefined` when the value is not recognized.
 */
function asTurnCoverage(value: unknown): GoogleRealtimeTurnCoverage | undefined {
  const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-");
  if (normalized === "only-activity" || normalized === "turn-includes-only-activity") {
    return "only-activity";
  }
  if (normalized === "all-input" || normalized === "turn-includes-all-input") {
    return "all-input";
  }
  if (
    normalized === "audio-activity-and-all-video" ||
    normalized === "turn-includes-audio-activity-and-all-video"
  ) {
    return "audio-activity-and-all-video";
  }
  return undefined;
}
|
||||
|
||||
function resolveGoogleRealtimeProviderConfigRecord(
|
||||
config: Record<string, unknown>,
|
||||
): Record<string, unknown> | undefined {
|
||||
@@ -140,6 +184,9 @@ function normalizeProviderConfig(
|
||||
silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
|
||||
startSensitivity: asSensitivity(raw?.startSensitivity),
|
||||
endSensitivity: asSensitivity(raw?.endSensitivity),
|
||||
activityHandling: asActivityHandling(raw?.activityHandling),
|
||||
turnCoverage: asTurnCoverage(raw?.turnCoverage),
|
||||
automaticActivityDetectionDisabled: asBoolean(raw?.automaticActivityDetectionDisabled),
|
||||
enableAffectiveDialog: asBoolean(raw?.enableAffectiveDialog),
|
||||
thinkingLevel: asThinkingLevel(raw?.thinkingLevel),
|
||||
thinkingBudget: asFiniteNumber(raw?.thinkingBudget),
|
||||
@@ -176,6 +223,32 @@ function mapEndSensitivity(
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Translates a normalized activity-handling mode into the SDK enum.
 *
 * @param value Normalized mode, or `undefined` when not configured.
 * @returns The matching `ActivityHandling` member, or `undefined` when unset.
 */
function mapActivityHandling(
  value: GoogleRealtimeActivityHandling | undefined,
): ActivityHandling | undefined {
  if (value === "no-interruption") {
    return ActivityHandling.NO_INTERRUPTION;
  }
  if (value === "start-of-activity-interrupts") {
    return ActivityHandling.START_OF_ACTIVITY_INTERRUPTS;
  }
  return undefined;
}
|
||||
|
||||
/**
 * Translates a normalized turn-coverage mode into the SDK enum.
 *
 * @param value Normalized mode, or `undefined` when not configured.
 * @returns The matching `TurnCoverage` member, or `undefined` when unset.
 */
function mapTurnCoverage(value: GoogleRealtimeTurnCoverage | undefined): TurnCoverage | undefined {
  if (value === "only-activity") {
    return TurnCoverage.TURN_INCLUDES_ONLY_ACTIVITY;
  }
  if (value === "all-input") {
    return TurnCoverage.TURN_INCLUDES_ALL_INPUT;
  }
  if (value === "audio-activity-and-all-video") {
    return TurnCoverage.TURN_INCLUDES_AUDIO_ACTIVITY_AND_ALL_VIDEO;
  }
  return undefined;
}
|
||||
|
||||
function buildThinkingConfig(config: GoogleRealtimeVoiceBridgeConfig): ThinkingConfig | undefined {
|
||||
if (config.thinkingLevel) {
|
||||
return { thinkingLevel: config.thinkingLevel.toUpperCase() as ThinkingConfig["thinkingLevel"] };
|
||||
@@ -191,7 +264,12 @@ function buildRealtimeInputConfig(
|
||||
): RealtimeInputConfig | undefined {
|
||||
const startSensitivity = mapStartSensitivity(config.startSensitivity);
|
||||
const endSensitivity = mapEndSensitivity(config.endSensitivity);
|
||||
const activityHandling = mapActivityHandling(config.activityHandling);
|
||||
const turnCoverage = mapTurnCoverage(config.turnCoverage);
|
||||
const automaticActivityDetection = {
|
||||
...(typeof config.automaticActivityDetectionDisabled === "boolean"
|
||||
? { disabled: config.automaticActivityDetectionDisabled }
|
||||
: {}),
|
||||
...(startSensitivity ? { startOfSpeechSensitivity: startSensitivity } : {}),
|
||||
...(endSensitivity ? { endOfSpeechSensitivity: endSensitivity } : {}),
|
||||
...(typeof config.prefixPaddingMs === "number"
|
||||
@@ -201,9 +279,12 @@ function buildRealtimeInputConfig(
|
||||
? { silenceDurationMs: Math.max(0, Math.floor(config.silenceDurationMs)) }
|
||||
: {}),
|
||||
};
|
||||
return Object.keys(automaticActivityDetection).length > 0
|
||||
? { automaticActivityDetection }
|
||||
: undefined;
|
||||
const realtimeInputConfig = {
|
||||
...(Object.keys(automaticActivityDetection).length > 0 ? { automaticActivityDetection } : {}),
|
||||
...(activityHandling ? { activityHandling } : {}),
|
||||
...(turnCoverage ? { turnCoverage } : {}),
|
||||
};
|
||||
return Object.keys(realtimeInputConfig).length > 0 ? realtimeInputConfig : undefined;
|
||||
}
|
||||
|
||||
function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): FunctionDeclaration[] {
|
||||
@@ -519,6 +600,9 @@ export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin
|
||||
silenceDurationMs: config.silenceDurationMs,
|
||||
startSensitivity: config.startSensitivity,
|
||||
endSensitivity: config.endSensitivity,
|
||||
activityHandling: config.activityHandling,
|
||||
turnCoverage: config.turnCoverage,
|
||||
automaticActivityDetectionDisabled: config.automaticActivityDetectionDisabled,
|
||||
enableAffectiveDialog: config.enableAffectiveDialog,
|
||||
thinkingLevel: config.thinkingLevel,
|
||||
thinkingBudget: config.thinkingBudget,
|
||||
|
||||
Reference in New Issue
Block a user