Mirror of https://github.com/openclaw/openclaw.git
fix: stabilize Google Meet realtime talkback
@@ -45,6 +45,7 @@ Docs: https://docs.openclaw.ai
- QA/cache: require the full `CACHE-OK <suffix>` marker before live cache probes stop retrying, so suffix-only prose cannot hide a broken probe response. Thanks @vincentkoc.
- Slack/Matrix: avoid creating blank progress-draft messages when `streaming.progress.label=false` and progress tool lines are disabled. Thanks @vincentkoc.
- QA/Matrix: keep the mock OpenAI tool-progress provider aligned with exact-marker Matrix prompts so the hardened live preview scenario still forces a deterministic read before final delivery. Thanks @vincentkoc.
- Google Meet: make realtime talk-back agent-driven by default with `realtime.strategy: "agent"`, keep the previous direct bidirectional model behavior available as `realtime.strategy: "bidi"`, route the Meet tab speaker output to `BlackHole 2ch` automatically for local Chrome realtime joins, coalesce nearby speech transcript fragments before consulting the agent, and avoid cutting off agent speech on server VAD or stale playback pipe errors.
- OpenAI/Google Meet: wait for realtime voice `session.updated` before treating the bridge as connected, so Meet joins do not return with audio queued behind an unconfigured realtime session. Thanks @vincentkoc.
- Plugins/catalog: merge official external catalog descriptors into partial package channel config metadata, so lagging WeCom/Yuanbao manifests keep their own schema while still exposing host-supplied labels and setup text. Thanks @vincentkoc.
- Plugins/catalog: supplement lagging official external WeCom and Yuanbao npm manifests with channel config descriptors and declared tool contracts from the OpenClaw catalog, so trusted package sweeps no longer fail because external package metadata trails the host contract. Thanks @vincentkoc.
@@ -190,7 +190,7 @@ then share the returned `meetingUri`.
```

For an observe-only/browser-control join, set `"mode": "transcribe"`. That does
not start the duplex realtime voice bridge, does not require BlackHole or SoX,
and will not talk back into the meeting. Chrome joins in this mode also avoid
OpenClaw's microphone/camera permission grant and avoid the Meet **Use
microphone** path. If Meet shows an audio-choice interstitial, automation tries
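As a hedged sketch, such a join request might look like the snippet below. Only `mode`, `transport`, and the `meetingUri` field appear in the surrounding docs; the `action` name and the URI value are assumptions for illustration:

```ts
// Observe-only join sketch. "action: join" and the exact argument shape are
// assumptions, not confirmed by this commit; mode/transport are documented above.
const joinArgs = {
  action: "join",                                      // assumed action name
  meetingUri: "https://meet.google.com/abc-defg-hij",  // illustrative URI
  transport: "chrome",
  mode: "transcribe", // observe-only: no realtime voice bridge, no talk-back
};
```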
@@ -1027,6 +1027,12 @@ Defaults:
  interruption on `chrome.bargeInInputCommand`
- `chrome.bargeInCooldownMs: 900`: minimum delay between repeated human
  interruption clears
- `realtime.strategy: "agent"`: default. Participant speech is transcribed,
  sent to the configured OpenClaw agent in a per-meeting sub-agent session, and
  the returned answer is spoken back through the realtime provider.
- `realtime.strategy: "bidi"`: direct bidirectional realtime model mode. The
  realtime provider answers participant speech directly and may call
  `openclaw_agent_consult` for deeper/tool-backed answers.
- `realtime.provider: "openai"`
- `realtime.toolPolicy: "safe-read-only"`
- `realtime.instructions`: brief spoken replies, with
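As a minimal sketch, overriding only the strategy could look like this (field names and values come from the defaults above; the surrounding config shape follows the override example in the next section):

```ts
// Minimal strategy override sketch; all other realtime fields keep their defaults.
const realtimeOverride = {
  realtime: {
    strategy: "bidi", // "agent" (default) routes speech through the OpenClaw agent
    provider: "openai",
    toolPolicy: "safe-read-only",
  },
};
```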
@@ -1072,6 +1078,7 @@ Optional overrides:
    node: "parallels-macos",
  },
  realtime: {
    strategy: "agent",
    provider: "google",
    agentId: "jay",
    toolPolicy: "owner",
@@ -1124,7 +1131,10 @@ Agents can use the `google_meet` tool:
Use `transport: "chrome"` when Chrome runs on the Gateway host. Use
`transport: "chrome-node"` when Chrome runs on a paired node such as a Parallels
VM. In both cases the realtime model and `openclaw_agent_consult` run on the
Gateway host, so model credentials stay there. With the default
`realtime.strategy: "agent"`, the realtime provider handles audio and
transcription while the configured OpenClaw agent produces the spoken answer.
With `realtime.strategy: "bidi"`, the realtime model answers directly.

Use `action: "status"` to list active sessions or inspect a session ID. Use
`action: "speak"` with `sessionId` and `message` to make the realtime agent
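For illustration, a manual speak request could look like the sketch below (the field names `action`, `sessionId`, and `message` come from the docs above; the values are invented):

```ts
// Sketch of a manual speak request; sessionId and message are illustrative.
const speakArgs = {
  action: "speak",
  sessionId: "meet-1",
  message: "I'll follow up on the launch blockers after the call.",
};
```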
@@ -1149,6 +1159,8 @@ a session ended.
  not send the intro/test phrase into the audio bridge.
- `providerConnected` / `realtimeReady`: realtime voice bridge state
- `lastInputAt` / `lastOutputAt`: last audio seen from or sent to the bridge
- `audioOutputRouted` / `audioOutputDeviceLabel`: whether the Meet tab's media
  output was actively routed to the BlackHole device used by the bridge
- `lastSuppressedInputAt` / `suppressedInputBytes`: loopback input ignored while
  assistant playback is active
@@ -1164,8 +1176,20 @@ a session ended.
Chrome realtime mode is optimized for a live voice loop. The realtime voice
provider hears the meeting audio and speaks through the configured audio bridge.
The default `realtime.strategy: "agent"` uses the realtime provider for audio
I/O and transcription, but routes final participant transcripts through the
configured OpenClaw agent before speaking. Set `realtime.strategy: "bidi"` when
you want the realtime model to answer directly.
Nearby final transcript fragments are coalesced before the consult so one spoken
turn does not produce several stale partial answers.
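That coalescing is a debounce over final transcript fragments; a simplified sketch follows (the 900 ms constant comes from this commit, and `consultAgent` is a hypothetical stand-in for the consult call shown later on this page):

```ts
// Simplified coalescing sketch; mirrors enqueueAgentConsultForUserTranscript below.
let pendingQuestion: string | undefined;
let debounceTimer: ReturnType<typeof setTimeout> | undefined;

function onFinalUserTranscript(text: string) {
  // Merge fragments from the same spoken turn into one question.
  pendingQuestion = pendingQuestion ? `${pendingQuestion}\n${text}` : text;
  if (debounceTimer) clearTimeout(debounceTimer);
  debounceTimer = setTimeout(() => {
    const question = pendingQuestion;
    pendingQuestion = undefined;
    if (question) void consultAgent(question); // hypothetical consult helper
  }, 900); // GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS in this commit
}
```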
| Strategy | Who decides the answer        | Context behavior                                                                     | Use when                                              |
| -------- | ----------------------------- | ------------------------------------------------------------------------------------ | ----------------------------------------------------- |
| `agent`  | The configured OpenClaw agent | Per-meeting sub-agent session plus normal agent policy, tools, workspace, and memory | You want "my agent is in the meeting" behavior        |
| `bidi`   | The realtime voice model      | Realtime session context, with optional `openclaw_agent_consult` calls               | You want the lowest-latency conversational voice loop |
In `bidi` strategy, when the realtime model needs deeper reasoning, current
information, or normal OpenClaw tools, it can call `openclaw_agent_consult`.

The consult tool runs the regular OpenClaw agent behind the scenes with recent
meeting transcript context and returns a concise spoken answer to the realtime
@@ -1176,6 +1200,10 @@ By default, consults run against the `main` agent. Set `realtime.agentId` when a
Meet lane should consult a dedicated OpenClaw agent workspace, model defaults,
tool policy, memory, and session history.

Agent strategy consults use a per-meeting `agent:<id>:subagent:google-meet:<session>`
session key so follow-up questions keep meeting context while inheriting normal
agent policy from the configured agent.
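For example, with `realtime.agentId: "jay"` and meeting session `meet-1` (illustrative values), the consult helper changed by this commit builds:

```ts
// Illustrative values; the key shape matches consultOpenClawAgentForGoogleMeet below.
const sessionKey = `agent:${agentId}:subagent:google-meet:${meetingSessionId}`;
// => "agent:jay:subagent:google-meet:meet-1"
```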

`realtime.toolPolicy` controls the consult run:

- `safe-read-only`: expose the consult tool and limit the regular agent to
@@ -1414,7 +1442,8 @@ Also verify:
- `BlackHole 2ch` is visible on the Chrome host.
- `sox` exists on the Chrome host.
- Meet microphone and speaker are routed through the virtual audio path used by
  OpenClaw. `doctor` should show `meet output routed: yes` for local Chrome
  realtime joins.

`googlemeet doctor [session-id]` prints the session, node, in-call state,
manual action reason, realtime provider connection, `realtimeReady`, audio
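An illustrative fragment of that output (line labels come from `writeDoctorStatus` in this commit; the values are invented):

```
provider: openai
realtime strategy: agent
in call: yes
lobby waiting: no
captioning: yes
realtime ready: yes
audio input active: yes
audio output active: yes
meet output routed: yes
meet output device: BlackHole 2ch
```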
@@ -1578,7 +1607,7 @@ phone dial-in participation.
Chrome realtime mode needs `BlackHole 2ch` plus either:

- `chrome.audioInputCommand` plus `chrome.audioOutputCommand`: OpenClaw owns the
  realtime voice bridge and pipes audio in `chrome.audioFormat` between those
  commands and the selected realtime voice provider. The default Chrome path is
  24 kHz PCM16; 8 kHz G.711 mu-law remains available for legacy command pairs.
- `chrome.audioBridgeCommand`: an external bridge command owns the whole local
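A hedged sketch of a command pair at the default 24 kHz PCM16 path (the SoX flags describe a typical macOS/BlackHole setup and are assumptions, not taken from this commit):

```ts
// Illustrative command-pair config; the exact SoX arguments are assumptions.
const chromeAudioSketch = {
  audioFormat: "pcm16-24khz",
  // Capture meeting audio from the BlackHole loopback device as raw PCM16 mono.
  audioInputCommand: ["sox", "-q", "-t", "coreaudio", "BlackHole 2ch",
    "-t", "raw", "-r", "24000", "-e", "signed", "-b", "16", "-c", "1", "-"],
  // Play assistant audio from stdin back into the virtual device.
  audioOutputCommand: ["sox", "-q", "-t", "raw", "-r", "24000", "-e", "signed",
    "-b", "16", "-c", "1", "-", "-t", "coreaudio", "BlackHole 2ch"],
};
```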
@@ -371,6 +371,7 @@ describe("google-meet plugin", () => {
    postDtmfSpeechDelayMs: 5000,
  },
  realtime: {
    strategy: "agent",
    provider: "openai",
    introMessage: "Say exactly: I'm here and listening.",
    toolPolicy: "safe-read-only",
@@ -2253,7 +2254,7 @@ describe("google-meet plugin", () => {
  );
});

it("retries caption enable until the captions button is available", async () => {
  const makeButton = (label: string) => ({
    disabled: false,
    innerText: "",
@@ -2302,23 +2303,23 @@ describe("google-meet plugin", () => {
      captureCaptions: true,
      guestName: "OpenClaw Agent",
    })})`,
  ).runInContext(context) as () => string | Promise<string>;

  const first = JSON.parse(await inspect()) as { captionsEnabledAttempted?: boolean };
  const stateAfterFirst = windowState.__openclawMeetCaptions as { enabledAttempted?: boolean };
  expect(first.captionsEnabledAttempted).toBe(false);
  expect(stateAfterFirst.enabledAttempted).toBe(false);
  expect(captionButton.click).not.toHaveBeenCalled();

  page.buttons = [leaveButton, captionButton];
  const second = JSON.parse(await inspect()) as { captionsEnabledAttempted?: boolean };
  const stateAfterSecond = windowState.__openclawMeetCaptions as { enabledAttempted?: boolean };
  expect(second.captionsEnabledAttempted).toBe(true);
  expect(stateAfterSecond.enabledAttempted).toBe(true);
  expect(captionButton.click).toHaveBeenCalledTimes(1);
});

it("reports in-call Meet audio permission problems from button labels", () => {
|
||||
it("reports in-call Meet audio permission problems from button labels", async () => {
|
||||
const makeButton = (label: string) => ({
|
||||
disabled: false,
|
||||
innerText: "",
|
||||
@@ -2361,9 +2362,9 @@ describe("google-meet plugin", () => {
      captureCaptions: false,
      guestName: "OpenClaw Agent",
    })})`,
  ).runInContext(context) as () => string | Promise<string>;

  const result = JSON.parse(await inspect()) as {
    inCall?: boolean;
    manualActionRequired?: boolean;
    manualActionReason?: string;
@@ -2376,7 +2377,7 @@ describe("google-meet plugin", () => {
  expect(result.manualActionMessage).toContain("Allow microphone/camera/speaker permissions");
});

it("uses the local Meet microphone control instead of remote participant mute buttons", async () => {
  const makeButton = (label: string, disabled = false) => ({
    disabled,
    innerText: "",
@@ -2416,9 +2417,9 @@ describe("google-meet plugin", () => {
      captureCaptions: false,
      guestName: "OpenClaw Agent",
    })})`,
  ).runInContext(context) as () => string | Promise<string>;

  const result = JSON.parse(await inspect()) as { micMuted?: boolean; notes?: string[] };

  expect(result.micMuted).toBe(true);
  expect(localMic.click).toHaveBeenCalledTimes(1);
@@ -3526,7 +3527,7 @@ describe("google-meet plugin", () => {
const handle = await startCommandRealtimeAudioBridge({
  config: resolveGoogleMeetConfig({
    realtime: { strategy: "bidi", provider: "openai", model: "gpt-realtime", agentId: "jay" },
  }),
  fullConfig: {} as never,
  runtime: runtime as never,
@@ -3579,6 +3580,7 @@ describe("google-meet plugin", () => {
expect(outputProcess.kill).toHaveBeenCalledWith("SIGKILL");
expect(replacementOutputStdinWrites).toEqual([Buffer.from([6, 7])]);
outputProcess.emit("error", new Error("stale output process failed after clear"));
outputStdin.emit("error", new Error("stale output pipe closed after clear"));
expect(bridge.close).not.toHaveBeenCalled();
expect(bridge.acknowledgeMark).toHaveBeenCalled();
expect(bridge.triggerGreeting).not.toHaveBeenCalled();
@@ -3616,6 +3618,7 @@ describe("google-meet plugin", () => {
    sampleRateHz: 24000,
    channels: 1,
  },
  autoRespondToAudio: true,
  tools: [
    expect.objectContaining({
      name: "openclaw_agent_consult",
@@ -3635,13 +3638,14 @@ describe("google-meet plugin", () => {
expect.objectContaining({
  messageProvider: "google-meet",
  agentId: "jay",
  spawnedBy: "agent:jay:main",
  sessionKey: "agent:jay:subagent:google-meet:meet-1",
  sandboxSessionKey: "agent:jay:subagent:google-meet:meet-1",
  thinkLevel: "high",
  toolsAllow: ["read", "web_search", "web_fetch", "x_search", "memory_search", "memory_get"],
}),
);
expect(sessionStore).toHaveProperty("agent:jay:subagent:google-meet:meet-1");

await handle.stop();
expect(bridge.close).toHaveBeenCalled();
@@ -3649,6 +3653,119 @@ describe("google-meet plugin", () => {
expect(replacementOutputProcess.kill).toHaveBeenCalledWith("SIGTERM");
});

it("defaults Chrome command-pair realtime to agent-driven talk-back", async () => {
  let callbacks: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0] | undefined;
  const sendUserMessage = vi.fn();
  const bridge = {
    connect: vi.fn(async () => {}),
    sendAudio: vi.fn(),
    sendUserMessage,
    setMediaTimestamp: vi.fn(),
    submitToolResult: vi.fn(),
    acknowledgeMark: vi.fn(),
    close: vi.fn(),
    triggerGreeting: vi.fn(),
    isConnected: vi.fn(() => true),
  };
  const provider: RealtimeVoiceProviderPlugin = {
    id: "openai",
    label: "OpenAI",
    autoSelectOrder: 1,
    resolveConfig: ({ rawConfig }) => rawConfig,
    isConfigured: () => true,
    createBridge: (req) => {
      callbacks = req;
      return bridge;
    },
  };
  const inputStdout = new PassThrough();
  const makeProcess = (stdio: {
    stdin?: { write(chunk: unknown): unknown } | null;
    stdout?: { on(event: "data", listener: (chunk: unknown) => void): unknown } | null;
  }): TestBridgeProcess => {
    const proc = new EventEmitter() as unknown as TestBridgeProcess;
    proc.stdin = stdio.stdin;
    proc.stdout = stdio.stdout;
    proc.stderr = new PassThrough();
    proc.killed = false;
    proc.kill = vi.fn(() => {
      proc.killed = true;
      return true;
    });
    return proc;
  };
  const outputProcess = makeProcess({
    stdin: new Writable({
      write(_chunk, _encoding, done) {
        done();
      },
    }),
    stdout: null,
  });
  const inputProcess = makeProcess({ stdout: inputStdout, stdin: null });
  const spawnMock = vi.fn().mockReturnValueOnce(outputProcess).mockReturnValueOnce(inputProcess);
  const sessionStore: Record<string, unknown> = {};
  const runtime = {
    agent: {
      resolveAgentDir: vi.fn(() => "/tmp/agent"),
      resolveAgentWorkspaceDir: vi.fn(() => "/tmp/workspace"),
      ensureAgentWorkspace: vi.fn(async () => {}),
      session: {
        resolveStorePath: vi.fn(() => "/tmp/sessions.json"),
        loadSessionStore: vi.fn(() => sessionStore),
        saveSessionStore: vi.fn(async () => {}),
        updateSessionStore: vi.fn(async (_storePath, mutator) => mutator(sessionStore as never)),
        resolveSessionFilePath: vi.fn(() => "/tmp/session.json"),
      },
      runEmbeddedPiAgent: vi.fn(async () => ({
        payloads: [{ text: "The launch is still on track." }],
        meta: {},
      })),
      resolveAgentTimeoutMs: vi.fn(() => 1000),
    },
  };

  const handle = await startCommandRealtimeAudioBridge({
    config: resolveGoogleMeetConfig({ realtime: { provider: "openai", agentId: "jay" } }),
    fullConfig: {} as never,
    runtime: runtime as never,
    meetingSessionId: "meet-1",
    inputCommand: ["capture-meet"],
    outputCommand: ["play-meet"],
    logger: noopLogger,
    providers: [provider],
    spawn: spawnMock,
  });

  // Agent strategy is the default: no provider auto-responses and no provider tools.
  expect(callbacks).toMatchObject({
    autoRespondToAudio: false,
    tools: [],
  });
  // Two nearby final transcripts should coalesce into a single agent consult.
  callbacks?.onTranscript?.("user", "Are we still on track?", true);
  callbacks?.onTranscript?.("user", "Please include launch blockers.", true);

  await vi.waitFor(() => {
    expect(runtime.agent.runEmbeddedPiAgent).toHaveBeenCalledTimes(1);
    expect(runtime.agent.runEmbeddedPiAgent).toHaveBeenCalledWith(
      expect.objectContaining({
        agentId: "jay",
        spawnedBy: "agent:jay:main",
        sessionKey: "agent:jay:subagent:google-meet:meet-1",
        sandboxSessionKey: "agent:jay:subagent:google-meet:meet-1",
      }),
    );
  });
  expect(JSON.stringify(runtime.agent.runEmbeddedPiAgent.mock.calls[0]?.[0])).toContain(
    "Are we still on track?\\nPlease include launch blockers.",
  );
  // The agent's answer is spoken back verbatim via a speak-exact user message.
  expect(sendUserMessage).toHaveBeenCalledWith(
    expect.stringContaining(JSON.stringify("The launch is still on track.")),
  );
  expect(sessionStore).toHaveProperty("agent:jay:subagent:google-meet:meet-1");

  await handle.stop();
});

it("uses a local barge-in input command to clear active Chrome playback", async () => {
|
||||
let callbacks:
|
||||
| {
|
||||
@@ -3818,7 +3935,7 @@ describe("google-meet plugin", () => {
const handle = await startNodeRealtimeAudioBridge({
  config: resolveGoogleMeetConfig({
    realtime: { strategy: "bidi", provider: "openai", model: "gpt-realtime" },
  }),
  fullConfig: {} as never,
  runtime: runtime as never,
@@ -3901,6 +4018,7 @@ describe("google-meet plugin", () => {
    sampleRateHz: 24000,
    channels: 1,
  },
  autoRespondToAudio: true,
  tools: [
    expect.objectContaining({
      name: "openclaw_agent_consult",

@@ -150,6 +150,10 @@ const googleMeetConfigSchema = {
  advanced: true,
},
"voiceCall.introMessage": { label: "Voice Call Intro Message", advanced: true },
"realtime.strategy": {
  label: "Realtime Strategy",
  help: "Agent routes participant speech through OpenClaw before speaking; bidi lets the realtime model answer directly.",
},
"realtime.provider": {
  label: "Realtime Provider",
  help: "Defaults to OpenAI; uses OPENAI_API_KEY when no provider config is set.",

@@ -143,6 +143,10 @@
  "label": "Voice Call Intro Message",
  "advanced": true
},
"realtime.strategy": {
  "label": "Realtime Strategy",
  "help": "Agent routes participant speech through OpenClaw before speaking; bidi lets the realtime model answer directly."
},
"realtime.provider": {
  "label": "Realtime Provider",
  "help": "Defaults to OpenAI; uses OPENAI_API_KEY when no provider config is set."
@@ -404,6 +408,11 @@
"type": "object",
"additionalProperties": false,
"properties": {
  "strategy": {
    "type": "string",
    "enum": ["agent", "bidi"],
    "default": "agent"
  },
  "provider": {
    "type": "string",
    "default": "openai"
@@ -413,7 +422,7 @@
},
"instructions": {
  "type": "string",
  "default": "You are joining a private Google Meet as an OpenClaw voice transport. Keep spoken replies brief and natural. In agent strategy, wait for OpenClaw consult results and speak them exactly. In bidi strategy, answer directly and call openclaw_agent_consult for deeper reasoning, current information, or tools."
},
"introMessage": {
  "type": "string",

@@ -48,7 +48,8 @@ export async function consultOpenClawAgentForGoogleMeet(params: {
  transcript: Array<{ role: "user" | "assistant"; text: string }>;
}): Promise<{ text: string }> {
  const agentId = normalizeAgentId(params.config.realtime.agentId);
  const requesterSessionKey = `agent:${agentId}:main`;
  const sessionKey = `agent:${agentId}:subagent:google-meet:${params.meetingSessionId}`;
  return await consultRealtimeVoiceAgent({
    cfg: params.fullConfig,
    agentRuntime: params.runtime.agent,
@@ -58,6 +59,7 @@ export async function consultOpenClawAgentForGoogleMeet(params: {
    messageProvider: "google-meet",
    lane: "google-meet",
    runIdPrefix: `google-meet:${params.meetingSessionId}`,
    spawnedBy: requesterSessionKey,
    args: params.args,
    transcript: params.transcript,
    surface: "a private Google Meet",

@@ -353,6 +353,9 @@ function writeDoctorStatus(status: Awaited<ReturnType<GoogleMeetRuntime["status"
  "provider: %s",
  session.chrome?.audioBridge?.provider ?? session.realtime.provider ?? "n/a",
);
if (session.realtime.enabled) {
  writeStdoutLine("realtime strategy: %s", session.realtime.strategy ?? "agent");
}
writeStdoutLine("in call: %s", formatBoolean(health?.inCall));
writeStdoutLine("lobby waiting: %s", formatBoolean(health?.lobbyWaiting));
writeStdoutLine("captioning: %s", formatBoolean(health?.captioning));
@@ -372,6 +375,11 @@ function writeDoctorStatus(status: Awaited<ReturnType<GoogleMeetRuntime["status"
writeStdoutLine("realtime ready: %s", formatBoolean(health?.realtimeReady));
writeStdoutLine("audio input active: %s", formatBoolean(health?.audioInputActive));
writeStdoutLine("audio output active: %s", formatBoolean(health?.audioOutputActive));
writeStdoutLine("meet output routed: %s", formatBoolean(health?.audioOutputRouted));
if (health?.audioOutputDeviceLabel || health?.audioOutputRouteError) {
  writeStdoutLine("meet output device: %s", formatOptional(health.audioOutputDeviceLabel));
  writeStdoutLine("meet output route error: %s", formatOptional(health.audioOutputRouteError));
}
writeStdoutLine(
  "last input: %s (%s bytes)",
  formatOptional(health?.lastInputAt),

@@ -10,6 +10,7 @@ import {
export type GoogleMeetTransport = "chrome" | "chrome-node" | "twilio";
export type GoogleMeetMode = "realtime" | "transcribe";
export type GoogleMeetRealtimeStrategy = "agent" | "bidi";
type GoogleMeetChromeAudioFormat = "pcm16-24khz" | "g711-ulaw-8khz";
export type GoogleMeetToolPolicy = RealtimeVoiceAgentConsultToolPolicy;

@@ -60,6 +61,7 @@ export type GoogleMeetConfig = {
  introMessage?: string;
};
realtime: {
  strategy: GoogleMeetRealtimeStrategy;
  provider?: string;
  model?: string;
  instructions?: string;
@@ -160,7 +162,7 @@ const DEFAULT_GOOGLE_MEET_BARGE_IN_RMS_THRESHOLD = 650;
const DEFAULT_GOOGLE_MEET_BARGE_IN_PEAK_THRESHOLD = 2500;
const DEFAULT_GOOGLE_MEET_BARGE_IN_COOLDOWN_MS = 900;

const DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS = `You are joining a private Google Meet as an OpenClaw voice transport. Keep spoken replies brief and natural. In agent strategy, wait for OpenClaw consult results and speak them exactly. In bidi strategy, answer directly and call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} for deeper reasoning, current information, or tools.`;
const DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE = "Say exactly: I'm here and listening.";

const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
@@ -195,6 +197,7 @@ const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
    postDtmfSpeechDelayMs: 5_000,
  },
  realtime: {
    strategy: "agent",
    provider: "openai",
    instructions: DEFAULT_GOOGLE_MEET_REALTIME_INSTRUCTIONS,
    introMessage: DEFAULT_GOOGLE_MEET_REALTIME_INTRO_MESSAGE,
@@ -321,6 +324,14 @@ function resolveMode(value: unknown, fallback: GoogleMeetMode): GoogleMeetMode {
  return normalized === "realtime" || normalized === "transcribe" ? normalized : fallback;
}

function resolveRealtimeStrategy(
  value: unknown,
  fallback: GoogleMeetRealtimeStrategy,
): GoogleMeetRealtimeStrategy {
  const normalized = normalizeOptionalLowercaseString(value);
  return normalized === "agent" || normalized === "bidi" ? normalized : fallback;
}

function resolveChromeAudioFormat(value: unknown): GoogleMeetChromeAudioFormat | undefined {
  const normalized = normalizeOptionalString(value)?.toLowerCase().replaceAll("_", "-");
  switch (normalized) {
@@ -464,6 +475,10 @@ export function resolveGoogleMeetConfigWithEnv(
    introMessage: normalizeOptionalString(voiceCall.introMessage),
  },
  realtime: {
    strategy: resolveRealtimeStrategy(
      realtime.strategy,
      DEFAULT_GOOGLE_MEET_CONFIG.realtime.strategy,
    ),
    provider:
      normalizeOptionalString(realtime.provider) ?? DEFAULT_GOOGLE_MEET_CONFIG.realtime.provider,
    model: normalizeOptionalString(realtime.model) ?? DEFAULT_GOOGLE_MEET_CONFIG.realtime.model,

@@ -124,6 +124,11 @@ function attachOutputProcessHandlers(session: NodeBridgeSession, outputProcess:
      stopSession(session);
    }
  });
  outputProcess.stdin?.on?.("error", () => {
    if (session.output === outputProcess) {
      stopSession(session);
    }
  });
}

function startOutputProcess(command: { command: string; args: string[] }) {
@@ -241,7 +246,12 @@ function pushAudio(params: Record<string, unknown>) {
  const audio = Buffer.from(base64, "base64");
  session.lastOutputAt = new Date().toISOString();
  session.lastOutputBytes += audio.byteLength;
  try {
    session.output?.stdin?.write(audio);
  } catch {
    stopSession(session);
    throw new Error(`bridge is not open: ${bridgeId}`);
  }
  return { bridgeId, ok: true };
}

@@ -15,6 +15,8 @@ import {
import type { GoogleMeetConfig } from "./config.js";
import {
  getGoogleMeetRealtimeTranscriptHealth,
  buildGoogleMeetSpeakExactUserMessage,
  GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS,
  getGoogleMeetRealtimeEventHealth,
  recordGoogleMeetRealtimeTranscript,
  recordGoogleMeetRealtimeEvent,
@@ -73,12 +75,95 @@ export async function startNodeRealtimeAudioBridge(params: {
});
const transcript: GoogleMeetRealtimeTranscriptEntry[] = [];
const realtimeEvents: GoogleMeetRealtimeEventEntry[] = [];
const strategy = params.config.realtime.strategy;
let agentConsultActive = false;
let pendingAgentQuestion: string | undefined;
let agentConsultDebounceTimer: ReturnType<typeof setTimeout> | undefined;
// Coalesce nearby final transcripts; consult the agent once per debounce window.
const enqueueAgentConsultForUserTranscript = (question: string): void => {
  const trimmed = question.trim();
  if (!trimmed || stopped) {
    return;
  }
  pendingAgentQuestion = pendingAgentQuestion ? `${pendingAgentQuestion}\n${trimmed}` : trimmed;
  if (agentConsultDebounceTimer) {
    clearTimeout(agentConsultDebounceTimer);
  }
  agentConsultDebounceTimer = setTimeout(() => {
    agentConsultDebounceTimer = undefined;
    const queuedQuestion = pendingAgentQuestion;
    pendingAgentQuestion = undefined;
    if (queuedQuestion && !stopped) {
      void runAgentConsultForUserTranscript(queuedQuestion);
    }
  }, GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS);
  agentConsultDebounceTimer.unref?.();
};
// Serialize consults; a question arriving mid-consult is queued and run next.
const runAgentConsultForUserTranscript = async (question: string): Promise<void> => {
  const trimmed = question.trim();
  if (!trimmed || stopped) {
    return;
  }
  if (agentConsultActive) {
    pendingAgentQuestion = trimmed;
    return;
  }
  agentConsultActive = true;
  let nextQuestion: string | undefined = trimmed;
  try {
    while (nextQuestion) {
      if (stopped) {
        return;
      }
      const currentQuestion = nextQuestion;
      pendingAgentQuestion = undefined;
      params.logger.info(`[google-meet] node realtime agent consult: ${currentQuestion}`);
      const result = await consultOpenClawAgentForGoogleMeet({
        config: params.config,
        fullConfig: params.fullConfig,
        runtime: params.runtime,
        logger: params.logger,
        meetingSessionId: params.meetingSessionId,
        args: {
          question: currentQuestion,
          responseStyle: "Brief, natural spoken answer for a live meeting.",
        },
        transcript,
      });
      if (!stopped && result.text.trim()) {
        bridge?.sendUserMessage(buildGoogleMeetSpeakExactUserMessage(result.text.trim()));
      }
      nextQuestion = pendingAgentQuestion;
    }
  } catch (error) {
    params.logger.warn(
      `[google-meet] node realtime agent consult failed: ${formatErrorMessage(error)}`,
    );
    if (!stopped) {
      bridge?.sendUserMessage(
        buildGoogleMeetSpeakExactUserMessage(
          "I hit an error while checking that. Please try again.",
        ),
      );
    }
  } finally {
    agentConsultActive = false;
    const queuedQuestion = pendingAgentQuestion;
    pendingAgentQuestion = undefined;
    if (queuedQuestion && !stopped) {
      void runAgentConsultForUserTranscript(queuedQuestion);
    }
  }
};

const stop = async () => {
  if (stopped) {
    return;
  }
  stopped = true;
  if (agentConsultDebounceTimer) {
    clearTimeout(agentConsultDebounceTimer);
    agentConsultDebounceTimer = undefined;
  }
  try {
    bridge?.close();
  } catch (error) {
@@ -106,9 +191,11 @@ export async function startNodeRealtimeAudioBridge(params: {
audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config),
instructions: params.config.realtime.instructions,
initialGreetingInstructions: params.config.realtime.introMessage,
autoRespondToAudio: strategy === "bidi",
triggerGreetingOnReady: false,
markStrategy: "ack-immediately",
tools:
  strategy === "bidi" ? resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy) : [],
audioSink: {
  isOpen: () => !stopped,
  sendAudio: (audio) => {
@@ -157,16 +244,32 @@ export async function startNodeRealtimeAudioBridge(params: {
if (isFinal) {
  recordGoogleMeetRealtimeTranscript(transcript, role, text);
  params.logger.info(`[google-meet] node realtime ${role}: ${text}`);
  if (role === "user" && strategy === "agent") {
    enqueueAgentConsultForUserTranscript(text);
  }
}
},
onEvent: (event) => {
  recordGoogleMeetRealtimeEvent(realtimeEvents, event);
  if (
    event.type === "error" ||
    event.type === "response.done" ||
    event.type === "input_audio_buffer.speech_started" ||
    event.type === "input_audio_buffer.speech_stopped" ||
    event.type === "conversation.item.input_audio_transcription.completed" ||
    event.type === "conversation.item.input_audio_transcription.failed"
  ) {
    const detail = event.detail ? ` ${event.detail}` : "";
    params.logger.info(`[google-meet] node realtime ${event.direction}:${event.type}${detail}`);
  }
},
onToolCall: (event, session) => {
  if (strategy !== "bidi") {
    session.submitToolResult(event.callId || event.itemId, {
      error: `Tool "${event.name}" is only available in bidi realtime strategy`,
    });
    return;
  }
  if (event.name !== GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME) {
    session.submitToolResult(event.callId || event.itemId, {
      error: `Tool "${event.name}" not available`,

@@ -99,10 +99,15 @@ export type GoogleMeetRealtimeEventEntry = RealtimeVoiceBridgeEvent & {
  at: string;
};

export const GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS = 900;

export function recordGoogleMeetRealtimeEvent(
  events: GoogleMeetRealtimeEventEntry[],
  event: RealtimeVoiceBridgeEvent,
) {
  if (event.direction === "client" && event.type === "input_audio_buffer.append") {
    return;
  }
  events.push({ at: new Date().toISOString(), ...event });
  if (events.length > 40) {
    events.splice(0, events.length - 40);
@@ -173,6 +178,13 @@ export function resolveGoogleMeetRealtimeProvider(params: {
  });
}

export function buildGoogleMeetSpeakExactUserMessage(text: string): string {
  return [
    "Speak this exact OpenClaw answer to the meeting, without adding, removing, or rephrasing words.",
    `Answer: ${JSON.stringify(text)}`,
  ].join("\n");
}

export async function startCommandRealtimeAudioBridge(params: {
  config: GoogleMeetConfig;
  fullConfig: OpenClawConfig;
@@ -212,6 +224,7 @@ export async function startCommandRealtimeAudioBridge(params: {
let lastOutputAtMs = 0;
let lastOutputPlayableUntilMs = 0;
let bargeInInputProcess: BridgeProcess | undefined;
let agentConsultDebounceTimer: ReturnType<typeof setTimeout> | undefined;

const suppressInputForOutput = (audio: Buffer) => {
  const bytesPerMs = params.config.chrome.audioFormat === "g711-ulaw-8khz" ? 8 : 48;
@@ -254,6 +267,10 @@ export async function startCommandRealtimeAudioBridge(params: {
  return;
}
stopped = true;
if (agentConsultDebounceTimer) {
  clearTimeout(agentConsultDebounceTimer);
  agentConsultDebounceTimer = undefined;
}
try {
  bridge?.close();
} catch (error) {
@@ -279,6 +296,12 @@ export async function startCommandRealtimeAudioBridge(params: {
  }
  fail("audio output command")(error);
});
proc.stdin?.on?.("error", (error: Error) => {
  if (proc !== outputProcess) {
    return;
  }
  fail("audio output command")(error);
});
proc.on("exit", (code, signal) => {
  if (proc !== outputProcess) {
    return;
@@ -310,6 +333,13 @@ export async function startCommandRealtimeAudioBridge(params: {
  );
  terminateProcess(previousOutput, "SIGKILL");
};
const writeOutputAudio = (audio: Buffer) => {
  try {
    outputProcess.stdin?.write(audio);
  } catch (error) {
    fail("audio output command")(error as Error);
  }
};
const startHumanBargeInMonitor = () => {
  const commandArgv = params.config.chrome.bargeInInputCommand;
  if (!commandArgv) {
@@ -384,17 +414,97 @@ export async function startCommandRealtimeAudioBridge(params: {
  fullConfig: params.fullConfig,
  providers: params.providers,
});
const strategy = params.config.realtime.strategy;
const transcript: GoogleMeetRealtimeTranscriptEntry[] = [];
const realtimeEvents: GoogleMeetRealtimeEventEntry[] = [];
let agentConsultActive = false;
let pendingAgentQuestion: string | undefined;
const enqueueAgentConsultForUserTranscript = (question: string): void => {
  const trimmed = question.trim();
  if (!trimmed || stopped) {
    return;
  }
  pendingAgentQuestion = pendingAgentQuestion ? `${pendingAgentQuestion}\n${trimmed}` : trimmed;
  if (agentConsultDebounceTimer) {
    clearTimeout(agentConsultDebounceTimer);
  }
  agentConsultDebounceTimer = setTimeout(() => {
    agentConsultDebounceTimer = undefined;
    const queuedQuestion = pendingAgentQuestion;
    pendingAgentQuestion = undefined;
    if (queuedQuestion && !stopped) {
      void runAgentConsultForUserTranscript(queuedQuestion);
    }
  }, GOOGLE_MEET_AGENT_TRANSCRIPT_DEBOUNCE_MS);
  agentConsultDebounceTimer.unref?.();
};
const runAgentConsultForUserTranscript = async (question: string): Promise<void> => {
  const trimmed = question.trim();
  if (!trimmed || stopped) {
    return;
  }
  if (agentConsultActive) {
    pendingAgentQuestion = trimmed;
    return;
  }
  agentConsultActive = true;
  let nextQuestion: string | undefined = trimmed;
  try {
    while (nextQuestion) {
      if (stopped) {
        return;
      }
      const currentQuestion = nextQuestion;
      pendingAgentQuestion = undefined;
      params.logger.info(`[google-meet] realtime agent consult: ${currentQuestion}`);
      const result = await consultOpenClawAgentForGoogleMeet({
        config: params.config,
        fullConfig: params.fullConfig,
        runtime: params.runtime,
        logger: params.logger,
        meetingSessionId: params.meetingSessionId,
        args: {
          question: currentQuestion,
          responseStyle: "Brief, natural spoken answer for a live meeting.",
        },
        transcript,
      });
      if (!stopped && result.text.trim()) {
        bridge?.sendUserMessage(buildGoogleMeetSpeakExactUserMessage(result.text.trim()));
      }
      nextQuestion = pendingAgentQuestion;
    }
  } catch (error) {
    params.logger.warn(
      `[google-meet] realtime agent consult failed: ${formatErrorMessage(error)}`,
    );
    if (!stopped) {
      bridge?.sendUserMessage(
        buildGoogleMeetSpeakExactUserMessage(
          "I hit an error while checking that. Please try again.",
        ),
      );
    }
  } finally {
    agentConsultActive = false;
    const queuedQuestion = pendingAgentQuestion;
    pendingAgentQuestion = undefined;
    if (queuedQuestion && !stopped) {
      void runAgentConsultForUserTranscript(queuedQuestion);
    }
  }
};
bridge = createRealtimeVoiceBridgeSession({
  provider: resolved.provider,
  providerConfig: resolved.providerConfig,
  audioFormat: resolveGoogleMeetRealtimeAudioFormat(params.config),
  instructions: params.config.realtime.instructions,
  initialGreetingInstructions: params.config.realtime.introMessage,
  autoRespondToAudio: strategy === "bidi",
  triggerGreetingOnReady: false,
  markStrategy: "ack-immediately",
  tools:
    strategy === "bidi" ? resolveGoogleMeetRealtimeTools(params.config.realtime.toolPolicy) : [],
  audioSink: {
    isOpen: () => !stopped,
    sendAudio: (audio) => {
@@ -402,7 +512,7 @@ export async function startCommandRealtimeAudioBridge(params: {
      lastOutputAt = new Date().toISOString();
      lastOutputBytes += audio.byteLength;
      suppressInputForOutput(audio);
      writeOutputAudio(audio);
    },
    clearAudio: clearOutputPlayback,
  },
@@ -410,16 +520,32 @@ export async function startCommandRealtimeAudioBridge(params: {
if (isFinal) {
  recordGoogleMeetRealtimeTranscript(transcript, role, text);
  params.logger.info(`[google-meet] realtime ${role}: ${text}`);
  if (role === "user" && strategy === "agent") {
    enqueueAgentConsultForUserTranscript(text);
  }
}
},
onEvent: (event) => {
  recordGoogleMeetRealtimeEvent(realtimeEvents, event);
  if (
    event.type === "error" ||
    event.type === "response.done" ||
    event.type === "input_audio_buffer.speech_started" ||
    event.type === "input_audio_buffer.speech_stopped" ||
    event.type === "conversation.item.input_audio_transcription.completed" ||
    event.type === "conversation.item.input_audio_transcription.failed"
  ) {
    const detail = event.detail ? ` ${event.detail}` : "";
    params.logger.info(`[google-meet] realtime ${event.direction}:${event.type}${detail}`);
  }
},
onToolCall: (event, session) => {
  if (strategy !== "bidi") {
    session.submitToolResult(event.callId || event.itemId, {
      error: `Tool "${event.name}" is only available in bidi realtime strategy`,
    });
    return;
  }
  if (event.name !== GOOGLE_MEET_AGENT_CONSULT_TOOL_NAME) {
    session.submitToolResult(event.callId || event.itemId, {
      error: `Tool "${event.name}" not available`,

@@ -392,6 +392,7 @@ export class GoogleMeetRuntime {
  : "signed-in Google Chrome profile",
realtime: {
  enabled: mode === "realtime",
  strategy: this.params.config.realtime.strategy,
  provider: this.params.config.realtime.provider,
  model: this.params.config.realtime.model,
  toolPolicy: this.params.config.realtime.toolPolicy,

@@ -217,6 +217,9 @@ function parseMeetBrowserStatus(result: unknown): GoogleMeetChromeHealth | undef
lastCaptionSpeaker?: string;
lastCaptionText?: string;
recentTranscript?: GoogleMeetChromeHealth["recentTranscript"];
audioOutputRouted?: boolean;
audioOutputDeviceLabel?: string;
audioOutputRouteError?: string;
manualActionRequired?: boolean;
manualActionReason?: GoogleMeetChromeHealth["manualActionReason"];
manualActionMessage?: string;
@@ -236,6 +239,9 @@ function parseMeetBrowserStatus(result: unknown): GoogleMeetChromeHealth | undef
lastCaptionSpeaker: parsed.lastCaptionSpeaker,
lastCaptionText: parsed.lastCaptionText,
recentTranscript: parsed.recentTranscript,
audioOutputRouted: parsed.audioOutputRouted,
audioOutputDeviceLabel: parsed.audioOutputDeviceLabel,
audioOutputRouteError: parsed.audioOutputRouteError,
manualActionRequired: parsed.manualActionRequired,
manualActionReason: parsed.manualActionReason,
manualActionMessage: parsed.manualActionMessage,
@@ -329,7 +335,7 @@ function meetStatusScript(params: {
  guestName: string;
  readOnly?: boolean;
}) {
  return `async () => {
  const text = (node) => (node?.innerText || node?.textContent || "").trim();
  const allowMicrophone = ${JSON.stringify(params.allowMicrophone)};
  const captureCaptions = ${JSON.stringify(params.captureCaptions)};
@@ -345,6 +351,9 @@ function meetStatusScript(params: {
    .join(" ");
  const buttonLabels = buttons.map(buttonLabel).filter(Boolean);
  const notes = [];
  let audioOutputRouted;
  let audioOutputDeviceLabel;
  let audioOutputRouteError;
  const findButton = (pattern) =>
    buttons.find((button) => {
      const label = buttonLabel(button);
@@ -398,6 +407,55 @@ function meetStatusScript(params: {
    notes.push("Skipped Meet microphone prompt for observe-only mode.");
  }
  const inCall = buttons.some((button) => /leave call/i.test(button.getAttribute('aria-label') || text(button)));
  const routeMeetAudioOutput = async () => {
    if (
      !allowMicrophone ||
      typeof navigator === 'undefined' ||
      !navigator.mediaDevices?.enumerateDevices
    ) return;
    const mediaElements = [...document.querySelectorAll('audio, video')]
      .filter((el) => typeof el.setSinkId === 'function');
    if (mediaElements.length === 0) return;
    try {
      const devices = await navigator.mediaDevices.enumerateDevices();
      const output = devices.find((device) =>
        device.kind === 'audiooutput' && /\\bBlackHole\\s+2ch\\b/i.test(device.label || '')
      ) || devices.find((device) =>
        device.kind === 'audiooutput' && /\\bBlackHole\\b/i.test(device.label || '')
      );
      if (!output?.deviceId) {
        if (devices.some((device) => device.kind === 'audiooutput')) {
          notes.push("BlackHole 2ch speaker output was not visible to Meet.");
        }
        return;
      }
      let routed = 0;
      for (const element of mediaElements) {
        if (element.sinkId !== output.deviceId) {
          if (readOnly) {
            continue;
          }
          await element.setSinkId(output.deviceId);
          routed += 1;
        }
      }
      audioOutputRouted = mediaElements.some((element) => element.sinkId === output.deviceId);
      audioOutputDeviceLabel = output.label || "BlackHole 2ch";
      if (!readOnly && audioOutputRouted) {
        notes.push(
          routed > 0
            ? \`Routed Meet media output to \${audioOutputDeviceLabel}.\`
            : \`Meet media output already routed to \${audioOutputDeviceLabel}.\`
        );
      }
    } catch (error) {
      audioOutputRouteError = error?.message || String(error);
      notes.push(\`Could not route Meet speaker output to BlackHole 2ch: \${audioOutputRouteError}\`);
    }
  };
  if (inCall) {
    await routeMeetAudioOutput();
  }
  let captioning = false;
  let captionsEnabledAttempted = false;
  let transcriptLines = 0;
@@ -520,6 +578,9 @@ function meetStatusScript(params: {
lastCaptionSpeaker,
lastCaptionText,
recentTranscript,
audioOutputRouted,
audioOutputDeviceLabel,
audioOutputRouteError,
manualActionRequired: Boolean(manualActionReason),
manualActionReason,
manualActionMessage,

@@ -71,6 +71,9 @@ export type GoogleMeetChromeHealth = {
realtimeReady?: boolean;
audioInputActive?: boolean;
audioOutputActive?: boolean;
audioOutputRouted?: boolean;
audioOutputDeviceLabel?: string;
audioOutputRouteError?: string;
lastInputAt?: string;
lastOutputAt?: string;
lastSuppressedInputAt?: string;
@@ -100,6 +103,7 @@ export type GoogleMeetSession = {
participantIdentity: string;
realtime: {
  enabled: boolean;
  strategy?: string;
  provider?: string;
  model?: string;
  toolPolicy: string;

@@ -84,6 +84,9 @@ type SentRealtimeEvent = {
session?: {
  input_audio_format?: string;
  output_audio_format?: string;
  turn_detection?: {
    create_response?: boolean;
  };
};
};

@@ -415,6 +418,80 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
expect(bridge.isConnected()).toBe(false);
});

it("can disable automatic audio turn responses for agent-routed voice loops", async () => {
  const provider = buildOpenAIRealtimeVoiceProvider();
  const bridge = provider.createBridge({
    providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
    autoRespondToAudio: false,
    onAudio: vi.fn(),
    onClearAudio: vi.fn(),
  });
  const connecting = bridge.connect();
  const socket = FakeWebSocket.instances[0];
  if (!socket) {
    throw new Error("expected bridge to create a websocket");
  }

  socket.readyState = FakeWebSocket.OPEN;
  socket.emit("open");
  socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
  await connecting;

  expect(parseSent(socket)[0]?.session).toMatchObject({
    turn_detection: expect.objectContaining({
      create_response: false,
    }),
  });
});

it("keeps assistant playback active on server VAD when automatic audio responses are disabled", async () => {
  const provider = buildOpenAIRealtimeVoiceProvider();
  const onAudio = vi.fn();
  const onClearAudio = vi.fn();
  const bridge = provider.createBridge({
    providerConfig: { apiKey: "sk-test" }, // pragma: allowlist secret
    autoRespondToAudio: false,
    onAudio,
    onClearAudio,
  });
  const connecting = bridge.connect();
  const socket = FakeWebSocket.instances[0];
  if (!socket) {
    throw new Error("expected bridge to create a websocket");
  }

  socket.readyState = FakeWebSocket.OPEN;
  socket.emit("open");
  socket.emit("message", Buffer.from(JSON.stringify({ type: "session.updated" })));
  await connecting;

  socket.emit(
    "message",
    Buffer.from(JSON.stringify({ type: "response.created", response: { id: "resp_1" } })),
  );
  socket.emit(
    "message",
    Buffer.from(
      JSON.stringify({
        type: "response.audio.delta",
        item_id: "item_1",
        delta: Buffer.from("assistant audio").toString("base64"),
      }),
    ),
  );
  socket.emit(
    "message",
    Buffer.from(JSON.stringify({ type: "input_audio_buffer.speech_started" })),
  );

  expect(onAudio).toHaveBeenCalledTimes(1);
  expect(onClearAudio).not.toHaveBeenCalled();
  expect(parseSent(socket)).not.toContainEqual({ type: "response.cancel" });
  expect(parseSent(socket)).not.toContainEqual(
    expect.objectContaining({ type: "conversation.item.truncate" }),
  );
});

it("can request PCM16 24 kHz realtime audio for Chrome command-pair bridges", async () => {
|
||||
const provider = buildOpenAIRealtimeVoiceProvider();
|
||||
const bridge = provider.createBridge({
|
||||
@@ -566,7 +643,7 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
  );
});

it("creates an explicit user item and response for manual speech", async () => {
  const provider = buildOpenAIRealtimeVoiceProvider();
  const onEvent = vi.fn();
  const bridge = provider.createBridge({
@@ -604,11 +681,9 @@ describe("buildOpenAIRealtimeVoiceProvider", () => {
  },
  {
    type: "response.create",
  },
]);
expect(JSON.stringify(parseSent(socket).at(-1))).not.toContain("output_modalities");
expect(onEvent).toHaveBeenCalledWith({ direction: "client", type: "conversation.item.create" });
expect(onEvent).toHaveBeenCalledWith({ direction: "client", type: "response.create" });
});

@@ -266,12 +266,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
    content: [{ type: "input_text", text }],
  },
});
this.sendEvent({ type: "response.create" });
}

triggerGreeting(instructions?: string): void {
@@ -537,7 +532,7 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
threshold: cfg.vadThreshold ?? 0.5,
prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
silence_duration_ms: cfg.silenceDurationMs ?? 500,
create_response: cfg.autoRespondToAudio ?? true,
},
temperature: cfg.temperature ?? 0.8,
...(cfg.tools && cfg.tools.length > 0
@@ -599,7 +594,9 @@ class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
}

case "input_audio_buffer.speech_started":
  if (this.config.autoRespondToAudio ?? true) {
    this.handleBargeIn();
  }
  return;

case "response.audio_transcript.delta":

@@ -39,6 +39,7 @@ export async function consultRealtimeVoiceAgent(params: {
assistantLabel?: string;
questionSourceLabel?: string;
agentId?: string;
spawnedBy?: string | null;
provider?: RunEmbeddedPiAgentParams["provider"];
model?: RunEmbeddedPiAgentParams["model"];
thinkLevel?: RunEmbeddedPiAgentParams["thinkLevel"];
@@ -73,6 +74,7 @@ export async function consultRealtimeVoiceAgent(params: {
sessionKey: params.sessionKey,
sandboxSessionKey: resolveRealtimeVoiceAgentSandboxSessionKey(agentId, params.sessionKey),
agentId,
spawnedBy: params.spawnedBy,
messageProvider: params.messageProvider,
sessionFile,
workspaceDir,

@@ -86,6 +86,7 @@ export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
providerConfig: RealtimeVoiceProviderConfig;
audioFormat?: RealtimeVoiceAudioFormat;
instructions?: string;
autoRespondToAudio?: boolean;
tools?: RealtimeVoiceTool[];
};

@@ -79,6 +79,28 @@ describe("realtime voice bridge session runtime", () => {
expect(request?.audioFormat).toEqual(REALTIME_VOICE_AUDIO_FORMAT_PCM16_24KHZ);
});

it("passes the audio auto-response preference to the provider bridge", () => {
  let request: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0] | undefined;
  const provider: RealtimeVoiceProviderPlugin = {
    id: "test",
    label: "Test",
    isConfigured: () => true,
    createBridge: (nextRequest) => {
      request = nextRequest;
      return makeBridge();
    },
  };

  createRealtimeVoiceBridgeSession({
    provider,
    providerConfig: {},
    autoRespondToAudio: false,
    audioSink: { sendAudio: vi.fn() },
  });

  expect(request?.autoRespondToAudio).toBe(false);
});

it("can acknowledge provider marks without transport mark support", () => {
  let callbacks: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0] | undefined;
  const bridge = makeBridge();

@@ -41,6 +41,7 @@ export type RealtimeVoiceBridgeSessionParams = {
audioSink: RealtimeVoiceAudioSink;
instructions?: string;
initialGreetingInstructions?: string;
autoRespondToAudio?: boolean;
markStrategy?: RealtimeVoiceMarkStrategy;
triggerGreetingOnReady?: boolean;
tools?: RealtimeVoiceTool[];
@@ -82,6 +83,7 @@ export function createRealtimeVoiceBridgeSession(
providerConfig: params.providerConfig,
audioFormat: params.audioFormat,
instructions: params.instructions,
autoRespondToAudio: params.autoRespondToAudio,
tools: params.tools,
onAudio: (audio) => {
  if (canSendAudio()) {