From 1634f91a35b5e140bfd7912fa69a44c8137e41a9 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 10:56:08 +0100 Subject: [PATCH] fix: improve google meet twilio join sequencing --- CHANGELOG.md | 2 + docs/plugins/google-meet.md | 18 ++-- extensions/google-meet/src/runtime.ts | 2 +- .../src/voice-call-gateway.test.ts | 30 +++++-- .../google-meet/src/voice-call-gateway.ts | 70 +++++++++++++-- extensions/voice-call/index.test.ts | 85 ++++++++++++++++++- extensions/voice-call/index.ts | 22 ++++- 7 files changed, 204 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23178bcd48d..6588294388b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,8 @@ Docs: https://docs.openclaw.ai - Gateway/pricing: abort in-flight model pricing catalog fetches when Gateway shutdown stops the refresh loop, and avoid post-stop cache writes or refresh timers. Fixes #72208. Thanks @rzcq. - Codex/app-server: make startup retry cleanup ownership-aware so concurrent Codex lanes cannot close another lane's freshly restarted shared app-server client. Thanks @vincentkoc. - Google Meet/Twilio: report missing dial-in details during setup and explain that Twilio cannot join Meet URLs without a phone dial plan. +- Google Meet/Twilio: start the phone leg before sending Meet PIN DTMF, delay intro speech until after the post-connect dial sequence, and log each stage so operators can tell Twilio-leg audio from Meet-room audio. +- Voice Call: accept provider call IDs for gateway speak/continue requests and report ended-call state from history instead of returning a generic "Call not found" for stale calls. - Control UI/Talk: allow the OpenAI Realtime WebRTC offer endpoint through the Control UI CSP, configure browser sessions with explicit VAD/transcription input settings, and surface OpenAI realtime error/lifecycle events instead of leaving Talk stuck as live with no diagnostic. Fixes #73427. 
- Plugins: clarify config-selected duplicate plugin override diagnostics and document manifest schema updates for bundled-plugin forks. Fixes #8582. Thanks @sachah. - CLI backends/Claude: make live-session JSONL turn caps bounded and configurable via `reliability.outputLimits`, raising the default guard for tool-heavy Claude CLI turns while preserving memory limits. Fixes #75838. Thanks @hcordoba840. diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index 55edf7410e2..189edde6824 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -1548,19 +1548,21 @@ participant: - Run `openclaw voicecall tail` and check that Twilio webhooks are arriving at the Gateway. - Run `openclaw logs --follow` and look for the Twilio Meet sequence: Google - Meet delegates the join, Voice Call stores pre-connect DTMF TwiML, serves - that initial TwiML, then serves realtime TwiML and starts the realtime bridge - with `initialGreeting=queued`. + Meet delegates the join, Voice Call starts the phone leg, Google Meet waits + `voiceCall.dtmfDelayMs`, sends DTMF with `voicecall.dtmf`, waits + `voiceCall.postDtmfSpeechDelayMs`, then requests intro speech with + `voicecall.speak`. - Re-run `openclaw googlemeet setup --transport twilio`; a green setup check is required but does not prove the meeting PIN sequence is correct. - Confirm the dial-in number belongs to the same Meet invitation and region as the PIN. -- Increase the leading pauses in `--dtmf-sequence` if Meet answers slowly, for - example `wwww123456#`. +- Increase `voiceCall.dtmfDelayMs` if Meet answers slowly or the call transcript + still shows the prompt asking for a PIN after DTMF was sent. - If the participant joins but you do not hear the greeting, check - `openclaw logs --follow` for realtime TwiML, realtime bridge startup, and - `initialGreeting=queued`. The greeting is generated from the initial - `voicecall.start` message after the realtime bridge connects. 
+ `openclaw logs --follow` for the post-DTMF `voicecall.speak` request and + either media-stream TTS playback or the Twilio `<Say>` fallback. If the call + transcript still contains "enter the meeting PIN", the phone leg has not joined + the Meet room yet, so meeting participants will not hear speech. If webhooks do not arrive, debug the Voice Call plugin first: the provider must reach `plugins.entries.voice-call.config.publicUrl` or the configured tunnel. diff --git a/extensions/google-meet/src/runtime.ts b/extensions/google-meet/src/runtime.ts index a796a0051f2..695c6598287 100644 --- a/extensions/google-meet/src/runtime.ts +++ b/extensions/google-meet/src/runtime.ts @@ -491,7 +491,7 @@ export class GoogleMeetRuntime { session.notes.push( this.params.config.voiceCall.enabled ? dtmfSequence - ? "Twilio transport delegated the call to the voice-call plugin and queued configured DTMF." + ? "Twilio transport delegated the phone leg to the voice-call plugin, then sent configured DTMF after connect before speaking." : "Twilio transport delegated the call to the voice-call plugin without configured DTMF." 
: "Twilio transport is an explicit dial plan; voice-call delegation is disabled.", ); diff --git a/extensions/google-meet/src/voice-call-gateway.test.ts b/extensions/google-meet/src/voice-call-gateway.test.ts index 6f248fe4efc..7f977f12a53 100644 --- a/extensions/google-meet/src/voice-call-gateway.test.ts +++ b/extensions/google-meet/src/voice-call-gateway.test.ts @@ -21,39 +21,59 @@ vi.mock("openclaw/plugin-sdk/gateway-runtime", () => ({ describe("Google Meet voice-call gateway", () => { beforeEach(() => { + vi.useRealTimers(); gatewayMocks.request.mockReset(); gatewayMocks.request.mockResolvedValue({ callId: "call-1" }); gatewayMocks.stopAndWait.mockClear(); gatewayMocks.startGatewayClientWhenEventLoopReady.mockClear(); }); - it("starts Twilio Meet calls with pre-connect DTMF and intro metadata", async () => { + it("starts Twilio Meet calls, sends delayed DTMF, then speaks the intro", async () => { const config = resolveGoogleMeetConfig({ voiceCall: { gatewayUrl: "ws://127.0.0.1:18789", dtmfDelayMs: 1, + postDtmfSpeechDelayMs: 2, }, realtime: { introMessage: "Say exactly: I'm here and listening." 
}, }); - await joinMeetViaVoiceCallGateway({ + const join = joinMeetViaVoiceCallGateway({ config, dialInNumber: "+15551234567", dtmfSequence: "123456#", message: "Say exactly: I'm here and listening.", }); + await join; + expect(gatewayMocks.request).toHaveBeenNthCalledWith( 1, "voicecall.start", { to: "+15551234567", mode: "conversation", - message: "Say exactly: I'm here and listening.", - dtmfSequence: "123456#", }, { timeoutMs: 30_000 }, ); - expect(gatewayMocks.request).toHaveBeenCalledTimes(1); + expect(gatewayMocks.request).toHaveBeenNthCalledWith( + 2, + "voicecall.dtmf", + { + callId: "call-1", + digits: "123456#", + }, + { timeoutMs: 30_000 }, + ); + expect(gatewayMocks.request).toHaveBeenNthCalledWith( + 3, + "voicecall.speak", + { + callId: "call-1", + message: "Say exactly: I'm here and listening.", + }, + { timeoutMs: 30_000 }, + ); + expect(gatewayMocks.request).toHaveBeenCalledTimes(3); }); }); diff --git a/extensions/google-meet/src/voice-call-gateway.ts b/extensions/google-meet/src/voice-call-gateway.ts index 8ea930019fc..fd6fb7afa94 100644 --- a/extensions/google-meet/src/voice-call-gateway.ts +++ b/extensions/google-meet/src/voice-call-gateway.ts @@ -18,12 +18,24 @@ type VoiceCallSpeakResult = { error?: string; }; +type VoiceCallDtmfResult = { + success?: boolean; + error?: string; +}; + type VoiceCallMeetJoinResult = { callId: string; dtmfSent: boolean; introSent: boolean; }; +function sleep(ms: number): Promise { + if (ms <= 0) { + return Promise.resolve(); + } + return new Promise((resolve) => setTimeout(resolve, ms)); +} + async function createConnectedGatewayClient( config: GoogleMeetConfig, ): Promise { @@ -81,15 +93,13 @@ export async function joinMeetViaVoiceCallGateway(params: { try { client = await createConnectedGatewayClient(params.config); params.logger?.info( - `[google-meet] Delegating Twilio join to Voice Call (dtmf=${params.dtmfSequence ? "yes" : "no"}, intro=${params.message ? 
"yes" : "no"})`, + `[google-meet] Delegating Twilio join to Voice Call (dtmf=${params.dtmfSequence ? "post-connect" : "none"}, intro=${params.message ? "delayed" : "none"})`, ); const start = (await client.request( "voicecall.start", { to: params.dialInNumber, mode: "conversation", - ...(params.message ? { message: params.message } : {}), - ...(params.dtmfSequence ? { dtmfSequence: params.dtmfSequence } : {}), }, { timeoutMs: params.config.voiceCall.requestTimeoutMs }, )) as VoiceCallStartResult; @@ -97,12 +107,60 @@ export async function joinMeetViaVoiceCallGateway(params: { throw new Error(start.error || "voicecall.start did not return callId"); } params.logger?.info( - `[google-meet] Voice Call Twilio join started: callId=${start.callId} dtmf=${params.dtmfSequence ? "yes" : "no"} intro=${params.message ? "yes" : "no"}`, + `[google-meet] Voice Call Twilio phone leg started: callId=${start.callId}`, ); + let dtmfSent = false; + if (params.dtmfSequence) { + const delayMs = params.config.voiceCall.dtmfDelayMs; + params.logger?.info( + `[google-meet] Waiting ${delayMs}ms before sending Meet DTMF for callId=${start.callId}`, + ); + await sleep(delayMs); + const dtmf = (await client.request( + "voicecall.dtmf", + { + callId: start.callId, + digits: params.dtmfSequence, + }, + { timeoutMs: params.config.voiceCall.requestTimeoutMs }, + )) as VoiceCallDtmfResult; + if (dtmf.success === false) { + throw new Error(dtmf.error || "voicecall.dtmf failed"); + } + dtmfSent = true; + params.logger?.info( + `[google-meet] Meet DTMF sent after phone leg connected: callId=${start.callId} digits=${params.dtmfSequence.length}`, + ); + } + let introSent = false; + if (params.message) { + const delayMs = params.dtmfSequence ? 
params.config.voiceCall.postDtmfSpeechDelayMs : 0; + if (delayMs > 0) { + params.logger?.info( + `[google-meet] Waiting ${delayMs}ms after Meet DTMF before speaking intro for callId=${start.callId}`, + ); + await sleep(delayMs); + } + const spoken = (await client.request( + "voicecall.speak", + { + callId: start.callId, + message: params.message, + }, + { timeoutMs: params.config.voiceCall.requestTimeoutMs }, + )) as VoiceCallSpeakResult; + if (spoken.success === false) { + throw new Error(spoken.error || "voicecall.speak failed"); + } + introSent = true; + params.logger?.info( + `[google-meet] Intro speech requested after Meet dial sequence: callId=${start.callId}`, + ); + } return { callId: start.callId, - dtmfSent: Boolean(params.dtmfSequence), - introSent: Boolean(params.message), + dtmfSent, + introSent, }; } finally { await client?.stopAndWait({ timeoutMs: 1_000 }); diff --git a/extensions/voice-call/index.test.ts b/extensions/voice-call/index.test.ts index 8f58d3e4939..c6086c077f0 100644 --- a/extensions/voice-call/index.test.ts +++ b/extensions/voice-call/index.test.ts @@ -6,6 +6,7 @@ import { createTestPluginApi } from "openclaw/plugin-sdk/plugin-test-api"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { OpenClawPluginApi } from "./api.js"; import type { VoiceCallRuntime } from "./runtime-entry.js"; +import type { CallRecord } from "./src/types.js"; let runtimeStub: VoiceCallRuntime; @@ -52,8 +53,12 @@ function captureStdout() { } function createRuntimeStub(callId = "call-1"): VoiceCallRuntime { + const call = createCallRecord({ callId }); return { - config: { toNumber: "+15550001234" } as VoiceCallRuntime["config"], + config: { + toNumber: "+15550001234", + realtime: { enabled: false }, + } as VoiceCallRuntime["config"], provider: {} as VoiceCallRuntime["provider"], manager: { initiateCall: vi.fn(async () => ({ callId, success: true })), @@ -64,17 +69,35 @@ function createRuntimeStub(callId = "call-1"): 
VoiceCallRuntime { speak: vi.fn(async () => ({ success: true })), sendDtmf: vi.fn(async () => ({ success: true })), endCall: vi.fn(async () => ({ success: true })), - getCall: vi.fn((id: string) => (id === callId ? { callId } : undefined)), + getCall: vi.fn((id: string) => (id === callId ? call : undefined)), getCallByProviderCallId: vi.fn(() => undefined), - getActiveCalls: vi.fn(() => [{ callId }]), + getActiveCalls: vi.fn(() => [call]), + getCallHistory: vi.fn(async () => []), } as unknown as VoiceCallRuntime["manager"], - webhookServer: {} as VoiceCallRuntime["webhookServer"], + webhookServer: { + speakRealtime: vi.fn(() => ({ success: false, error: "No active realtime bridge for call" })), + } as unknown as VoiceCallRuntime["webhookServer"], webhookUrl: "http://127.0.0.1:3334/voice/webhook", publicUrl: null, stop: vi.fn(async () => {}), }; } +function createCallRecord(overrides: Partial = {}): CallRecord { + return { + callId: "call-1", + provider: "mock", + direction: "outbound", + state: "active", + from: "+15550001111", + to: "+15550001234", + startedAt: Date.UTC(2026, 4, 2, 9, 0, 0), + transcript: [], + processedEventIds: [], + ...overrides, + }; +} + function createServiceContext(): Parameters["start"]>[0] { return { config: {}, @@ -397,6 +420,60 @@ describe("voice-call plugin", () => { expect(respond.mock.calls[0]).toEqual([true, { success: true }]); }); + it("normalizes provider call ids before speaking", async () => { + runtimeStub.manager.getCall = vi.fn(() => undefined); + runtimeStub.manager.getCallByProviderCallId = vi.fn(() => + createCallRecord({ + callId: "call-1", + providerCallId: "CA123", + }), + ); + const { methods } = setup({ provider: "mock" }); + const handler = methods.get("voicecall.speak") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const respond = vi.fn(); + + await handler?.({ params: { callId: "CA123", message: "hello" }, respond }); + + 
expect(runtimeStub.manager.speak).toHaveBeenCalledWith("call-1", "hello"); + expect(respond.mock.calls[0]).toEqual([true, { success: true }]); + }); + + it("reports ended call history when speaking to a stale call", async () => { + runtimeStub.manager.getCall = vi.fn(() => undefined); + runtimeStub.manager.getCallByProviderCallId = vi.fn(() => undefined); + runtimeStub.manager.getCallHistory = vi.fn(async () => [ + createCallRecord({ + callId: "call-1", + providerCallId: "CA123", + state: "completed", + endReason: "completed", + endedAt: Date.UTC(2026, 4, 2, 9, 18, 23), + }), + ]); + const { methods } = setup({ provider: "mock" }); + const handler = methods.get("voicecall.speak") as + | ((ctx: { + params: Record; + respond: ReturnType; + }) => Promise) + | undefined; + const respond = vi.fn(); + + await handler?.({ params: { callId: "CA123", message: "hello" }, respond }); + + const [ok, , error] = respond.mock.calls[0] ?? []; + expect(ok).toBe(false); + expect(error.message).toContain("call is not active"); + expect(error.message).toContain("last state=completed"); + expect(error.message).toContain("endReason=completed"); + expect(runtimeStub.manager.speak).not.toHaveBeenCalled(); + }); + it("normalizes legacy config through runtime creation and warns to run doctor", async () => { const { methods } = setup({ enabled: true, diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts index 0edd5779aae..a97c779e740 100644 --- a/extensions/voice-call/index.ts +++ b/extensions/voice-call/index.ts @@ -302,6 +302,22 @@ export default definePluginEntry({ respondError(respond, formatErrorMessage(err)); }; + const describeHistoricalCall = async (rt: VoiceCallRuntime, callId: string) => { + const history = await rt.manager.getCallHistory(100); + const call = history + .toReversed() + .find((candidate) => candidate.callId === callId || candidate.providerCallId === callId); + if (!call) { + return undefined; + } + const details = [ + `last 
state=${call.state}`, + call.endReason ? `endReason=${call.endReason}` : undefined, + call.endedAt ? `endedAt=${new Date(call.endedAt).toISOString()}` : undefined, + ].filter(Boolean); + return `call is not active (${details.join(", ")})`; + }; + const resolveCallMessageRequest = async (params: GatewayRequestHandlerOptions["params"]) => { const callId = normalizeOptionalString(params?.callId) ?? ""; const message = normalizeOptionalString(params?.message) ?? ""; @@ -309,7 +325,11 @@ export default definePluginEntry({ return { error: "callId and message required" } as const; } const rt = await ensureRuntime(); - return { rt, callId, message } as const; + const activeCall = rt.manager.getCall(callId) ?? rt.manager.getCallByProviderCallId(callId); + if (activeCall) { + return { rt, callId: activeCall.callId, message } as const; + } + return { error: (await describeHistoricalCall(rt, callId)) ?? "Call not found" } as const; }; const initiateCallAndRespond = async (params: {