fix: attach Google Meet realtime bridge

This commit is contained in:
Peter Steinberger
2026-04-24 09:41:23 +01:00
parent b5e5f2cede
commit 56fe2aab9c
10 changed files with 145 additions and 13 deletions

View File

@@ -355,6 +355,8 @@ Defaults:
- `realtime.toolPolicy: "safe-read-only"`
- `realtime.instructions`: brief spoken replies, with
`openclaw_agent_consult` for deeper answers
- `realtime.introMessage`: short spoken readiness check when the realtime bridge
connects; set it to `""` to join silently
Optional overrides:
@@ -371,6 +373,7 @@ Optional overrides:
},
realtime: {
toolPolicy: "owner",
introMessage: "Say exactly: I'm here.",
},
}
```
@@ -409,7 +412,16 @@ VM. In both cases the realtime model and `openclaw_agent_consult` run on the
Gateway host, so model credentials stay there.
Use `action: "status"` to list active sessions or inspect a session ID. Use
`action: "speak"` with `sessionId` and `message` to make the realtime agent
speak immediately. Use `action: "leave"` to mark a session ended.
```json
{
"action": "speak",
"sessionId": "meet_...",
"message": "Say exactly: I'm here and listening."
}
```
## Realtime agent consult
@@ -434,6 +446,12 @@ voice session. The voice model can then speak that answer back into the meeting.
The consult session key is scoped per Meet session, so follow-up consult calls
can reuse prior consult context during the same meeting.
To force a spoken readiness check after Chrome has fully joined the call:
```bash
openclaw googlemeet speak meet_... "Say exactly: I'm here and listening."
```
## Notes
Google Meet's official media API is receive-oriented, so speaking into a Meet
@@ -453,9 +471,9 @@ For clean duplex audio, route Meet output and Meet microphone through separate
virtual devices or a Loopback-style virtual device graph. A single shared
BlackHole device can echo other participants back into the call.
`googlemeet speak` triggers the active realtime audio bridge for a Chrome
session. `googlemeet leave` stops that bridge. For Twilio sessions delegated
through the Voice Call plugin, `leave` also hangs up the underlying voice call.
## Related

View File

@@ -205,6 +205,7 @@ describe("google-meet plugin", () => {
voiceCall: { enabled: true, requestTimeoutMs: 30000, dtmfDelayMs: 2500 },
realtime: {
provider: "openai",
introMessage: "Say exactly: I'm here and listening.",
toolPolicy: "safe-read-only",
},
oauth: {},
@@ -284,7 +285,7 @@ describe("google-meet plugin", () => {
properties: {
action: {
type: "string",
enum: ["join", "status", "setup_status", "resolve_space", "preflight", "leave"],
enum: ["join", "status", "setup_status", "resolve_space", "preflight", "leave", "speak"],
},
transport: { type: "string", enum: ["chrome", "chrome-node", "twilio"] },
mode: { type: "string", enum: ["realtime", "transcribe"] },
@@ -520,11 +521,16 @@ describe("google-meet plugin", () => {
});
it("joins Chrome on a paired node without local Chrome or BlackHole", async () => {
const { methods, nodesList, nodesInvoke } = setup({
defaultTransport: "chrome-node",
defaultMode: "transcribe",
chromeNode: { node: "parallels-macos" },
});
const { methods, nodesList, nodesInvoke } = setup(
{
defaultTransport: "chrome-node",
defaultMode: "transcribe",
chromeNode: { node: "parallels-macos" },
},
{
nodesInvokeResult: { payload: { launched: true } },
},
);
const handler = methods.get("googlemeet.join") as
| ((ctx: {
params: Record<string, unknown>;
@@ -669,6 +675,7 @@ describe("google-meet plugin", () => {
name: string;
args: unknown;
}) => void;
onReady?: () => void;
tools?: unknown[];
}
| undefined;
@@ -680,6 +687,7 @@ describe("google-meet plugin", () => {
submitToolResult: vi.fn(),
acknowledgeMark: vi.fn(),
close: vi.fn(),
triggerGreeting: vi.fn(),
isConnected: vi.fn(() => true),
};
const provider: RealtimeVoiceProviderPlugin = {
@@ -756,6 +764,7 @@ describe("google-meet plugin", () => {
inputStdout.write(Buffer.from([1, 2, 3]));
callbacks?.onAudio(Buffer.from([4, 5]));
callbacks?.onMark?.("mark-1");
callbacks?.onReady?.();
callbacks?.onToolCall?.({
itemId: "item-1",
callId: "tool-call-1",
@@ -772,6 +781,9 @@ describe("google-meet plugin", () => {
expect(sendAudio).toHaveBeenCalledWith(Buffer.from([1, 2, 3]));
expect(outputStdinWrites).toEqual([Buffer.from([4, 5])]);
expect(bridge.acknowledgeMark).toHaveBeenCalled();
expect(bridge.triggerGreeting).toHaveBeenCalledWith("Say exactly: I'm here and listening.");
handle.speak("Say exactly: hello from the meeting.");
expect(bridge.triggerGreeting).toHaveBeenLastCalledWith("Say exactly: hello from the meeting.");
expect(callbacks).toMatchObject({
tools: [
expect.objectContaining({
@@ -808,6 +820,7 @@ describe("google-meet plugin", () => {
name: string;
args: unknown;
}) => void;
onReady?: () => void;
tools?: unknown[];
}
| undefined;
@@ -819,6 +832,7 @@ describe("google-meet plugin", () => {
submitToolResult: vi.fn(),
acknowledgeMark: vi.fn(),
close: vi.fn(),
triggerGreeting: vi.fn(),
isConnected: vi.fn(() => true),
};
const provider: RealtimeVoiceProviderPlugin = {
@@ -879,6 +893,7 @@ describe("google-meet plugin", () => {
});
callbacks?.onAudio(Buffer.from([1, 2, 3]));
callbacks?.onReady?.();
callbacks?.onToolCall?.({
itemId: "item-1",
callId: "tool-call-1",
@@ -907,6 +922,9 @@ describe("google-meet plugin", () => {
text: "Use the launch update.",
});
});
expect(bridge.triggerGreeting).toHaveBeenCalledWith("Say exactly: I'm here and listening.");
handle.speak("Say exactly: hello from the node.");
expect(bridge.triggerGreeting).toHaveBeenLastCalledWith("Say exactly: hello from the node.");
expect(callbacks).toMatchObject({
tools: [
expect.objectContaining({

View File

@@ -88,6 +88,10 @@ const googleMeetConfigSchema = {
},
"realtime.model": { label: "Realtime Model", advanced: true },
"realtime.instructions": { label: "Realtime Instructions", advanced: true },
"realtime.introMessage": {
label: "Realtime Intro Message",
help: "Spoken once when the realtime bridge is ready. Set to an empty string to join silently.",
},
"realtime.toolPolicy": {
label: "Realtime Tool Policy",
help: "Safe read-only tools are available by default; owner requests can unlock broader tools.",
@@ -111,7 +115,7 @@ const googleMeetConfigSchema = {
const GoogleMeetToolSchema = Type.Object({
action: Type.String({
enum: ["join", "status", "setup_status", "resolve_space", "preflight", "leave"],
enum: ["join", "status", "setup_status", "resolve_space", "preflight", "leave", "speak"],
description: "Google Meet action to run",
}),
url: Type.Optional(Type.String({ description: "Explicit https://meet.google.com/... URL" })),
@@ -123,6 +127,7 @@ const GoogleMeetToolSchema = Type.Object({
pin: Type.Optional(Type.String({ description: "Meet phone PIN for Twilio" })),
dtmfSequence: Type.Optional(Type.String({ description: "Explicit DTMF sequence for Twilio" })),
sessionId: Type.Optional(Type.String({ description: "Meet session ID" })),
message: Type.Optional(Type.String({ description: "Realtime instructions to speak now" })),
meeting: Type.Optional(Type.String({ description: "Meet URL, meeting code, or spaces/{id}" })),
accessToken: Type.Optional(Type.String({ description: "Access token override" })),
refreshToken: Type.Optional(Type.String({ description: "Refresh token override" })),
@@ -265,6 +270,23 @@ export default definePluginEntry({
},
);
api.registerGatewayMethod(
"googlemeet.speak",
async ({ params, respond }: GatewayRequestHandlerOptions) => {
try {
const sessionId = normalizeOptionalString(params?.sessionId);
if (!sessionId) {
respond(false, { error: "sessionId required" });
return;
}
const rt = await ensureRuntime();
respond(true, rt.speak(sessionId, normalizeOptionalString(params?.message)));
} catch (err) {
sendError(respond, err);
}
},
);
api.registerTool({
name: "google_meet",
label: "Google Meet",
@@ -318,6 +340,14 @@ export default definePluginEntry({
}
return json(await rt.leave(sessionId));
}
case "speak": {
const rt = await ensureRuntime();
const sessionId = normalizeOptionalString(raw.sessionId);
if (!sessionId) {
throw new Error("sessionId required");
}
return json(rt.speak(sessionId, normalizeOptionalString(raw.message)));
}
default:
throw new Error("unknown google_meet action");
}

View File

@@ -108,6 +108,10 @@
"label": "Realtime Instructions",
"advanced": true
},
"realtime.introMessage": {
"label": "Realtime Intro Message",
"help": "Spoken once when the realtime bridge is ready. Set to an empty string to join silently."
},
"realtime.toolPolicy": {
"label": "Realtime Tool Policy",
"help": "Safe read-only tools are available by default; owner requests can unlock broader tools.",
@@ -312,6 +316,10 @@
"type": "string",
"default": "You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering."
},
"introMessage": {
"type": "string",
"default": "Say exactly: I'm here and listening."
},
"toolPolicy": {
"type": "string",
"enum": ["safe-read-only", "owner", "none"],

View File

@@ -304,4 +304,20 @@ export function registerGoogleMeetCli(params: {
}
writeStdoutLine("left %s", sessionId);
});
root
.command("speak")
.argument("<session-id>", "Meet session ID")
.argument("[message]", "Realtime instructions to speak now")
.action(async (sessionId: string, message?: string) => {
const rt = await params.ensureRuntime();
const result = rt.speak(sessionId, message);
if (!result.found) {
throw new Error("session not found");
}
if (!result.spoken) {
throw new Error("session has no active realtime audio bridge");
}
writeStdoutLine("speaking on %s", sessionId);
});
}

View File

@@ -48,6 +48,7 @@ export type GoogleMeetConfig = {
provider?: string;
model?: string;
instructions?: string;
introMessage?: string;
toolPolicy: GoogleMeetToolPolicy;
providers: Record<string, Record<string, unknown>>;
};
@@ -99,6 +100,7 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
] as const;
export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`;
export const DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE = "Say exactly: I'm here and listening.";
export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
enabled: true,
@@ -125,6 +127,7 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
realtime: {
provider: "openai",
instructions: DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS,
introMessage: DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE,
toolPolicy: "safe-read-only",
providers: {},
},
@@ -339,6 +342,9 @@ export function resolveGoogleMeetConfigWithEnv(
instructions:
normalizeOptionalString(realtime.instructions) ??
DEFAULT_GOOGLE_MEET_CONFIG.realtime.instructions,
introMessage:
normalizeOptionalString(realtime.introMessage) ??
DEFAULT_GOOGLE_MEET_CONFIG.realtime.introMessage,
toolPolicy: resolveToolPolicy(
realtime.toolPolicy,
DEFAULT_GOOGLE_MEET_CONFIG.realtime.toolPolicy,

View File

@@ -19,6 +19,7 @@ export type ChromeNodeRealtimeAudioBridgeHandle = {
providerId: string;
nodeId: string;
bridgeId: string;
speak: (instructions?: string) => void;
stop: () => Promise<void>;
};
@@ -81,6 +82,8 @@ export async function startNodeRealtimeAudioBridge(params: {
provider: resolved.provider,
providerConfig: resolved.providerConfig,
instructions: params.config.realtime.instructions,
initialGreetingInstructions: params.config.realtime.introMessage,
triggerGreetingOnReady: Boolean(params.config.realtime.introMessage),
markStrategy: "ack-immediately",
tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy),
audioSink: {
@@ -188,6 +191,9 @@ export async function startNodeRealtimeAudioBridge(params: {
providerId: resolved.provider.id,
nodeId: params.nodeId,
bridgeId: params.bridgeId,
speak: (instructions) => {
bridge?.triggerGreeting(instructions);
},
stop,
};
}

View File

@@ -41,6 +41,7 @@ export type ChromeRealtimeAudioBridgeHandle = {
providerId: string;
inputCommand: string[];
outputCommand: string[];
speak: (instructions?: string) => void;
stop: () => Promise<void>;
};
@@ -148,6 +149,8 @@ export async function startCommandRealtimeAudioBridge(params: {
provider: resolved.provider,
providerConfig: resolved.providerConfig,
instructions: params.config.realtime.instructions,
initialGreetingInstructions: params.config.realtime.introMessage,
triggerGreetingOnReady: Boolean(params.config.realtime.introMessage),
markStrategy: "ack-immediately",
tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy),
audioSink: {
@@ -210,6 +213,9 @@ export async function startCommandRealtimeAudioBridge(params: {
providerId: resolved.provider.id,
inputCommand: params.inputCommand,
outputCommand: params.outputCommand,
speak: (instructions) => {
bridge?.triggerGreeting(instructions);
},
stop,
};
}

View File

@@ -49,6 +49,7 @@ function resolveMode(input: GoogleMeetMode | undefined, config: GoogleMeetConfig
export class GoogleMeetRuntime {
readonly #sessions = new Map<string, GoogleMeetSession>();
readonly #sessionStops = new Map<string, () => Promise<void>>();
readonly #sessionSpeakers = new Map<string, (instructions?: string) => void>();
constructor(
private readonly params: {
@@ -151,6 +152,7 @@ export class GoogleMeetRuntime {
result.audioBridge?.type === "node-command-pair"
) {
this.#sessionStops.set(session.id, result.audioBridge.stop);
this.#sessionSpeakers.set(session.id, result.audioBridge.speak);
}
session.notes.push(
result.audioBridge
@@ -215,10 +217,28 @@ export class GoogleMeetRuntime {
const stop = this.#sessionStops.get(sessionId);
if (stop) {
this.#sessionStops.delete(sessionId);
this.#sessionSpeakers.delete(sessionId);
await stop();
}
session.state = "ended";
session.updatedAt = nowIso();
return { found: true, session };
}
speak(
sessionId: string,
instructions?: string,
): { found: boolean; spoken: boolean; session?: GoogleMeetSession } {
const session = this.#sessions.get(sessionId);
if (!session) {
return { found: false, spoken: false };
}
const speak = this.#sessionSpeakers.get(sessionId);
if (!speak || session.state !== "active") {
return { found: true, spoken: false, session };
}
speak(instructions || this.params.config.realtime.introMessage);
session.updatedAt = nowIso();
return { found: true, spoken: true, session };
}
}

View File

@@ -201,10 +201,14 @@ function parseNodeStartResult(raw: unknown): {
bridgeId?: string;
audioBridge?: { type?: string };
} {
if (!raw || typeof raw !== "object") {
const value =
raw && typeof raw === "object" && "payload" in raw
? (raw as { payload?: unknown }).payload
: raw;
if (!value || typeof value !== "object") {
throw new Error("Google Meet node returned an invalid start result.");
}
return raw as {
return value as {
launched?: boolean;
bridgeId?: string;
audioBridge?: { type?: string };