mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:30:42 +00:00
fix: attach Google Meet realtime bridge
This commit is contained in:
@@ -355,6 +355,8 @@ Defaults:
|
||||
- `realtime.toolPolicy: "safe-read-only"`
|
||||
- `realtime.instructions`: brief spoken replies, with
|
||||
`openclaw_agent_consult` for deeper answers
|
||||
- `realtime.introMessage`: short spoken readiness check when the realtime bridge
|
||||
connects; set it to `""` to join silently
|
||||
|
||||
Optional overrides:
|
||||
|
||||
@@ -371,6 +373,7 @@ Optional overrides:
|
||||
},
|
||||
realtime: {
|
||||
toolPolicy: "owner",
|
||||
introMessage: "Say exactly: I'm here.",
|
||||
},
|
||||
}
|
||||
```
|
||||
@@ -409,7 +412,16 @@ VM. In both cases the realtime model and `openclaw_agent_consult` run on the
|
||||
Gateway host, so model credentials stay there.
|
||||
|
||||
Use `action: "status"` to list active sessions or inspect a session ID. Use
|
||||
`action: "leave"` to mark a session ended.
|
||||
`action: "speak"` with `sessionId` and `message` to make the realtime agent
|
||||
speak immediately. Use `action: "leave"` to mark a session ended.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "speak",
|
||||
"sessionId": "meet_...",
|
||||
"message": "Say exactly: I'm here and listening."
|
||||
}
|
||||
```
|
||||
|
||||
## Realtime agent consult
|
||||
|
||||
@@ -434,6 +446,12 @@ voice session. The voice model can then speak that answer back into the meeting.
|
||||
The consult session key is scoped per Meet session, so follow-up consult calls
|
||||
can reuse prior consult context during the same meeting.
|
||||
|
||||
To force a spoken readiness check after Chrome has fully joined the call:
|
||||
|
||||
```bash
|
||||
openclaw googlemeet speak meet_... "Say exactly: I'm here and listening."
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
Google Meet's official media API is receive-oriented, so speaking into a Meet
|
||||
@@ -453,9 +471,9 @@ For clean duplex audio, route Meet output and Meet microphone through separate
|
||||
virtual devices or a Loopback-style virtual device graph. A single shared
|
||||
BlackHole device can echo other participants back into the call.
|
||||
|
||||
`googlemeet leave` stops the command-pair realtime audio bridge for Chrome
|
||||
sessions. For Twilio sessions delegated through the Voice Call plugin, it also
|
||||
hangs up the underlying voice call.
|
||||
`googlemeet speak` triggers the active realtime audio bridge for a Chrome
|
||||
session. `googlemeet leave` stops that bridge. For Twilio sessions delegated
|
||||
through the Voice Call plugin, `leave` also hangs up the underlying voice call.
|
||||
|
||||
## Related
|
||||
|
||||
|
||||
@@ -205,6 +205,7 @@ describe("google-meet plugin", () => {
|
||||
voiceCall: { enabled: true, requestTimeoutMs: 30000, dtmfDelayMs: 2500 },
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
introMessage: "Say exactly: I'm here and listening.",
|
||||
toolPolicy: "safe-read-only",
|
||||
},
|
||||
oauth: {},
|
||||
@@ -284,7 +285,7 @@ describe("google-meet plugin", () => {
|
||||
properties: {
|
||||
action: {
|
||||
type: "string",
|
||||
enum: ["join", "status", "setup_status", "resolve_space", "preflight", "leave"],
|
||||
enum: ["join", "status", "setup_status", "resolve_space", "preflight", "leave", "speak"],
|
||||
},
|
||||
transport: { type: "string", enum: ["chrome", "chrome-node", "twilio"] },
|
||||
mode: { type: "string", enum: ["realtime", "transcribe"] },
|
||||
@@ -520,11 +521,16 @@ describe("google-meet plugin", () => {
|
||||
});
|
||||
|
||||
it("joins Chrome on a paired node without local Chrome or BlackHole", async () => {
|
||||
const { methods, nodesList, nodesInvoke } = setup({
|
||||
defaultTransport: "chrome-node",
|
||||
defaultMode: "transcribe",
|
||||
chromeNode: { node: "parallels-macos" },
|
||||
});
|
||||
const { methods, nodesList, nodesInvoke } = setup(
|
||||
{
|
||||
defaultTransport: "chrome-node",
|
||||
defaultMode: "transcribe",
|
||||
chromeNode: { node: "parallels-macos" },
|
||||
},
|
||||
{
|
||||
nodesInvokeResult: { payload: { launched: true } },
|
||||
},
|
||||
);
|
||||
const handler = methods.get("googlemeet.join") as
|
||||
| ((ctx: {
|
||||
params: Record<string, unknown>;
|
||||
@@ -669,6 +675,7 @@ describe("google-meet plugin", () => {
|
||||
name: string;
|
||||
args: unknown;
|
||||
}) => void;
|
||||
onReady?: () => void;
|
||||
tools?: unknown[];
|
||||
}
|
||||
| undefined;
|
||||
@@ -680,6 +687,7 @@ describe("google-meet plugin", () => {
|
||||
submitToolResult: vi.fn(),
|
||||
acknowledgeMark: vi.fn(),
|
||||
close: vi.fn(),
|
||||
triggerGreeting: vi.fn(),
|
||||
isConnected: vi.fn(() => true),
|
||||
};
|
||||
const provider: RealtimeVoiceProviderPlugin = {
|
||||
@@ -756,6 +764,7 @@ describe("google-meet plugin", () => {
|
||||
inputStdout.write(Buffer.from([1, 2, 3]));
|
||||
callbacks?.onAudio(Buffer.from([4, 5]));
|
||||
callbacks?.onMark?.("mark-1");
|
||||
callbacks?.onReady?.();
|
||||
callbacks?.onToolCall?.({
|
||||
itemId: "item-1",
|
||||
callId: "tool-call-1",
|
||||
@@ -772,6 +781,9 @@ describe("google-meet plugin", () => {
|
||||
expect(sendAudio).toHaveBeenCalledWith(Buffer.from([1, 2, 3]));
|
||||
expect(outputStdinWrites).toEqual([Buffer.from([4, 5])]);
|
||||
expect(bridge.acknowledgeMark).toHaveBeenCalled();
|
||||
expect(bridge.triggerGreeting).toHaveBeenCalledWith("Say exactly: I'm here and listening.");
|
||||
handle.speak("Say exactly: hello from the meeting.");
|
||||
expect(bridge.triggerGreeting).toHaveBeenLastCalledWith("Say exactly: hello from the meeting.");
|
||||
expect(callbacks).toMatchObject({
|
||||
tools: [
|
||||
expect.objectContaining({
|
||||
@@ -808,6 +820,7 @@ describe("google-meet plugin", () => {
|
||||
name: string;
|
||||
args: unknown;
|
||||
}) => void;
|
||||
onReady?: () => void;
|
||||
tools?: unknown[];
|
||||
}
|
||||
| undefined;
|
||||
@@ -819,6 +832,7 @@ describe("google-meet plugin", () => {
|
||||
submitToolResult: vi.fn(),
|
||||
acknowledgeMark: vi.fn(),
|
||||
close: vi.fn(),
|
||||
triggerGreeting: vi.fn(),
|
||||
isConnected: vi.fn(() => true),
|
||||
};
|
||||
const provider: RealtimeVoiceProviderPlugin = {
|
||||
@@ -879,6 +893,7 @@ describe("google-meet plugin", () => {
|
||||
});
|
||||
|
||||
callbacks?.onAudio(Buffer.from([1, 2, 3]));
|
||||
callbacks?.onReady?.();
|
||||
callbacks?.onToolCall?.({
|
||||
itemId: "item-1",
|
||||
callId: "tool-call-1",
|
||||
@@ -907,6 +922,9 @@ describe("google-meet plugin", () => {
|
||||
text: "Use the launch update.",
|
||||
});
|
||||
});
|
||||
expect(bridge.triggerGreeting).toHaveBeenCalledWith("Say exactly: I'm here and listening.");
|
||||
handle.speak("Say exactly: hello from the node.");
|
||||
expect(bridge.triggerGreeting).toHaveBeenLastCalledWith("Say exactly: hello from the node.");
|
||||
expect(callbacks).toMatchObject({
|
||||
tools: [
|
||||
expect.objectContaining({
|
||||
|
||||
@@ -88,6 +88,10 @@ const googleMeetConfigSchema = {
|
||||
},
|
||||
"realtime.model": { label: "Realtime Model", advanced: true },
|
||||
"realtime.instructions": { label: "Realtime Instructions", advanced: true },
|
||||
"realtime.introMessage": {
|
||||
label: "Realtime Intro Message",
|
||||
help: "Spoken once when the realtime bridge is ready. Set to an empty string to join silently.",
|
||||
},
|
||||
"realtime.toolPolicy": {
|
||||
label: "Realtime Tool Policy",
|
||||
help: "Safe read-only tools are available by default; owner requests can unlock broader tools.",
|
||||
@@ -111,7 +115,7 @@ const googleMeetConfigSchema = {
|
||||
|
||||
const GoogleMeetToolSchema = Type.Object({
|
||||
action: Type.String({
|
||||
enum: ["join", "status", "setup_status", "resolve_space", "preflight", "leave"],
|
||||
enum: ["join", "status", "setup_status", "resolve_space", "preflight", "leave", "speak"],
|
||||
description: "Google Meet action to run",
|
||||
}),
|
||||
url: Type.Optional(Type.String({ description: "Explicit https://meet.google.com/... URL" })),
|
||||
@@ -123,6 +127,7 @@ const GoogleMeetToolSchema = Type.Object({
|
||||
pin: Type.Optional(Type.String({ description: "Meet phone PIN for Twilio" })),
|
||||
dtmfSequence: Type.Optional(Type.String({ description: "Explicit DTMF sequence for Twilio" })),
|
||||
sessionId: Type.Optional(Type.String({ description: "Meet session ID" })),
|
||||
message: Type.Optional(Type.String({ description: "Realtime instructions to speak now" })),
|
||||
meeting: Type.Optional(Type.String({ description: "Meet URL, meeting code, or spaces/{id}" })),
|
||||
accessToken: Type.Optional(Type.String({ description: "Access token override" })),
|
||||
refreshToken: Type.Optional(Type.String({ description: "Refresh token override" })),
|
||||
@@ -265,6 +270,23 @@ export default definePluginEntry({
|
||||
},
|
||||
);
|
||||
|
||||
// Gateway method "googlemeet.speak": validates `params.sessionId`, then passes
// it and the optional `params.message` to the runtime's `speak()` and responds
// with whatever that call returns.
api.registerGatewayMethod(
|
||||
"googlemeet.speak",
|
||||
async ({ params, respond }: GatewayRequestHandlerOptions) => {
|
||||
try {
|
||||
const sessionId = normalizeOptionalString(params?.sessionId);
|
||||
// Reject before touching the runtime when no usable session ID was supplied.
if (!sessionId) {
|
||||
respond(false, { error: "sessionId required" });
|
||||
return;
|
||||
}
|
||||
const rt = await ensureRuntime();
|
||||
// NOTE(review): the first argument is `true` regardless of the contents of
// the speak() result, so a result reporting an unknown session or nothing
// spoken is still sent through this branch — callers must inspect the payload.
respond(true, rt.speak(sessionId, normalizeOptionalString(params?.message)));
|
||||
} catch (err) {
|
||||
// Anything thrown by ensureRuntime() or speak() is handed to sendError.
sendError(respond, err);
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
api.registerTool({
|
||||
name: "google_meet",
|
||||
label: "Google Meet",
|
||||
@@ -318,6 +340,14 @@ export default definePluginEntry({
|
||||
}
|
||||
return json(await rt.leave(sessionId));
|
||||
}
|
||||
case "speak": {
|
||||
const rt = await ensureRuntime();
|
||||
const sessionId = normalizeOptionalString(raw.sessionId);
|
||||
if (!sessionId) {
|
||||
throw new Error("sessionId required");
|
||||
}
|
||||
return json(rt.speak(sessionId, normalizeOptionalString(raw.message)));
|
||||
}
|
||||
default:
|
||||
throw new Error("unknown google_meet action");
|
||||
}
|
||||
|
||||
@@ -108,6 +108,10 @@
|
||||
"label": "Realtime Instructions",
|
||||
"advanced": true
|
||||
},
|
||||
"realtime.introMessage": {
|
||||
"label": "Realtime Intro Message",
|
||||
"help": "Spoken once when the realtime bridge is ready. Set to an empty string to join silently."
|
||||
},
|
||||
"realtime.toolPolicy": {
|
||||
"label": "Realtime Tool Policy",
|
||||
"help": "Safe read-only tools are available by default; owner requests can unlock broader tools.",
|
||||
@@ -312,6 +316,10 @@
|
||||
"type": "string",
|
||||
"default": "You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call openclaw_agent_consult before answering."
|
||||
},
|
||||
"introMessage": {
|
||||
"type": "string",
|
||||
"default": "Say exactly: I'm here and listening."
|
||||
},
|
||||
"toolPolicy": {
|
||||
"type": "string",
|
||||
"enum": ["safe-read-only", "owner", "none"],
|
||||
|
||||
@@ -304,4 +304,20 @@ export function registerGoogleMeetCli(params: {
|
||||
}
|
||||
writeStdoutLine("left %s", sessionId);
|
||||
});
|
||||
|
||||
// `speak` subcommand: takes a required session ID and an optional message,
// calls the runtime's speak(), and throws when the result reports the session
// missing or nothing spoken; otherwise writes a confirmation line to stdout.
root
|
||||
.command("speak")
|
||||
.argument("<session-id>", "Meet session ID")
|
||||
.argument("[message]", "Realtime instructions to speak now")
|
||||
.action(async (sessionId: string, message?: string) => {
|
||||
const rt = await params.ensureRuntime();
|
||||
const result = rt.speak(sessionId, message);
|
||||
if (!result.found) {
|
||||
throw new Error("session not found");
|
||||
}
|
||||
// NOTE(review): GoogleMeetRuntime.speak also returns spoken=false when the
// session state is not "active", so this message can appear for an ended
// session as well as for one without a registered bridge.
if (!result.spoken) {
|
||||
throw new Error("session has no active realtime audio bridge");
|
||||
}
|
||||
writeStdoutLine("speaking on %s", sessionId);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -48,6 +48,7 @@ export type GoogleMeetConfig = {
|
||||
provider?: string;
|
||||
model?: string;
|
||||
instructions?: string;
|
||||
introMessage?: string;
|
||||
toolPolicy: GoogleMeetToolPolicy;
|
||||
providers: Record<string, Record<string, unknown>>;
|
||||
};
|
||||
@@ -99,6 +100,7 @@ export const DEFAULT_GOOGLE_MEET_AUDIO_OUTPUT_COMMAND = [
|
||||
] as const;
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw agent. Keep spoken replies brief and natural. When a question needs deeper reasoning, current information, or tools, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} before answering.`;
|
||||
export const DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE = "Say exactly: I'm here and listening.";
|
||||
|
||||
export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
|
||||
enabled: true,
|
||||
@@ -125,6 +127,7 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
instructions: DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS,
|
||||
introMessage: DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE,
|
||||
toolPolicy: "safe-read-only",
|
||||
providers: {},
|
||||
},
|
||||
@@ -339,6 +342,9 @@ export function resolveGoogleMeetConfigWithEnv(
|
||||
instructions:
|
||||
normalizeOptionalString(realtime.instructions) ??
|
||||
DEFAULT_GOOGLE_MEET_CONFIG.realtime.instructions,
|
||||
introMessage:
|
||||
normalizeOptionalString(realtime.introMessage) ??
|
||||
DEFAULT_GOOGLE_MEET_CONFIG.realtime.introMessage,
|
||||
toolPolicy: resolveToolPolicy(
|
||||
realtime.toolPolicy,
|
||||
DEFAULT_GOOGLE_MEET_CONFIG.realtime.toolPolicy,
|
||||
|
||||
@@ -19,6 +19,7 @@ export type ChromeNodeRealtimeAudioBridgeHandle = {
|
||||
providerId: string;
|
||||
nodeId: string;
|
||||
bridgeId: string;
|
||||
speak: (instructions?: string) => void;
|
||||
stop: () => Promise<void>;
|
||||
};
|
||||
|
||||
@@ -81,6 +82,8 @@ export async function startNodeRealtimeAudioBridge(params: {
|
||||
provider: resolved.provider,
|
||||
providerConfig: resolved.providerConfig,
|
||||
instructions: params.config.realtime.instructions,
|
||||
initialGreetingInstructions: params.config.realtime.introMessage,
|
||||
triggerGreetingOnReady: Boolean(params.config.realtime.introMessage),
|
||||
markStrategy: "ack-immediately",
|
||||
tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy),
|
||||
audioSink: {
|
||||
@@ -188,6 +191,9 @@ export async function startNodeRealtimeAudioBridge(params: {
|
||||
providerId: resolved.provider.id,
|
||||
nodeId: params.nodeId,
|
||||
bridgeId: params.bridgeId,
|
||||
speak: (instructions) => {
|
||||
bridge?.triggerGreeting(instructions);
|
||||
},
|
||||
stop,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -41,6 +41,7 @@ export type ChromeRealtimeAudioBridgeHandle = {
|
||||
providerId: string;
|
||||
inputCommand: string[];
|
||||
outputCommand: string[];
|
||||
speak: (instructions?: string) => void;
|
||||
stop: () => Promise<void>;
|
||||
};
|
||||
|
||||
@@ -148,6 +149,8 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
provider: resolved.provider,
|
||||
providerConfig: resolved.providerConfig,
|
||||
instructions: params.config.realtime.instructions,
|
||||
initialGreetingInstructions: params.config.realtime.introMessage,
|
||||
triggerGreetingOnReady: Boolean(params.config.realtime.introMessage),
|
||||
markStrategy: "ack-immediately",
|
||||
tools: resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy),
|
||||
audioSink: {
|
||||
@@ -210,6 +213,9 @@ export async function startCommandRealtimeAudioBridge(params: {
|
||||
providerId: resolved.provider.id,
|
||||
inputCommand: params.inputCommand,
|
||||
outputCommand: params.outputCommand,
|
||||
speak: (instructions) => {
|
||||
bridge?.triggerGreeting(instructions);
|
||||
},
|
||||
stop,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -49,6 +49,7 @@ function resolveMode(input: GoogleMeetMode | undefined, config: GoogleMeetConfig
|
||||
export class GoogleMeetRuntime {
|
||||
readonly #sessions = new Map<string, GoogleMeetSession>();
|
||||
readonly #sessionStops = new Map<string, () => Promise<void>>();
|
||||
readonly #sessionSpeakers = new Map<string, (instructions?: string) => void>();
|
||||
|
||||
constructor(
|
||||
private readonly params: {
|
||||
@@ -151,6 +152,7 @@ export class GoogleMeetRuntime {
|
||||
result.audioBridge?.type === "node-command-pair"
|
||||
) {
|
||||
this.#sessionStops.set(session.id, result.audioBridge.stop);
|
||||
this.#sessionSpeakers.set(session.id, result.audioBridge.speak);
|
||||
}
|
||||
session.notes.push(
|
||||
result.audioBridge
|
||||
@@ -215,10 +217,28 @@ export class GoogleMeetRuntime {
|
||||
const stop = this.#sessionStops.get(sessionId);
|
||||
if (stop) {
|
||||
this.#sessionStops.delete(sessionId);
|
||||
this.#sessionSpeakers.delete(sessionId);
|
||||
await stop();
|
||||
}
|
||||
session.state = "ended";
|
||||
session.updatedAt = nowIso();
|
||||
return { found: true, session };
|
||||
}
|
||||
|
||||
/**
 * Invokes the speak callback registered for a session, if there is one.
 *
 * @param sessionId - Key looked up in the runtime's session map.
 * @param instructions - Text handed to the speak callback. Any falsy value
 *   (undefined or an empty string) is replaced by the configured
 *   `realtime.introMessage`.
 * @returns `found: false` when no session has this ID. `found: true,
 *   spoken: false` when the session has no registered speak callback or its
 *   state is not "active". Otherwise `spoken: true`, after the callback has
 *   been called and the session's `updatedAt` refreshed.
 */
speak(
|
||||
sessionId: string,
|
||||
instructions?: string,
|
||||
): { found: boolean; spoken: boolean; session?: GoogleMeetSession } {
|
||||
const session = this.#sessions.get(sessionId);
|
||||
if (!session) {
|
||||
return { found: false, spoken: false };
|
||||
}
|
||||
const speak = this.#sessionSpeakers.get(sessionId);
|
||||
// No callback registered for this session, or the session is no longer active.
if (!speak || session.state !== "active") {
|
||||
return { found: true, spoken: false, session };
|
||||
}
|
||||
// The callback returns void, so `spoken: true` below only means it was
// invoked; its outcome is not inspected here.
speak(instructions || this.params.config.realtime.introMessage);
|
||||
session.updatedAt = nowIso();
|
||||
return { found: true, spoken: true, session };
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,10 +201,14 @@ function parseNodeStartResult(raw: unknown): {
|
||||
bridgeId?: string;
|
||||
audioBridge?: { type?: string };
|
||||
} {
|
||||
if (!raw || typeof raw !== "object") {
|
||||
const value =
|
||||
raw && typeof raw === "object" && "payload" in raw
|
||||
? (raw as { payload?: unknown }).payload
|
||||
: raw;
|
||||
if (!value || typeof value !== "object") {
|
||||
throw new Error("Google Meet node returned an invalid start result.");
|
||||
}
|
||||
return raw as {
|
||||
return value as {
|
||||
launched?: boolean;
|
||||
bridgeId?: string;
|
||||
audioBridge?: { type?: string };
|
||||
|
||||
Reference in New Issue
Block a user