From ae07d57f9d5ffebe56bbe3981afe9cf92aefbd14 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 1 May 2026 07:04:53 +0100 Subject: [PATCH] fix: sequence meet dtmf before realtime bridge --- CHANGELOG.md | 2 +- docs/plugins/google-meet.md | 11 +++-- docs/plugins/voice-call.md | 8 +-- extensions/google-meet/index.ts | 9 +++- extensions/google-meet/openclaw.plugin.json | 3 +- .../src/voice-call-gateway.test.ts | 22 ++------- .../google-meet/src/voice-call-gateway.ts | 28 +---------- extensions/voice-call/index.test.ts | 8 ++- extensions/voice-call/index.ts | 8 +++ .../voice-call/src/manager/outbound.test.ts | 49 +++++++++++++++++++ extensions/voice-call/src/manager/outbound.ts | 13 ++++- extensions/voice-call/src/manager/twiml.ts | 8 +++ .../voice-call/src/providers/twilio.test.ts | 35 +++++++++++++ extensions/voice-call/src/providers/twilio.ts | 6 ++- extensions/voice-call/src/types.ts | 2 + 15 files changed, 151 insertions(+), 61 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2adc155439b..57cc8788a30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ Docs: https://docs.openclaw.ai ### Fixes -- Google Meet/Voice Call: defer Twilio dial-in intro speech until after Meet DTMF entry and route delayed speech through the active realtime Voice Call bridge. Thanks @donkeykong91 and @PfanP. +- Google Meet/Voice Call: play Twilio Meet DTMF before opening the realtime media stream and carry the intro as the initial Voice Call message, so the greeting is generated after Meet admits the phone participant instead of racing a live-call TwiML update. Thanks @donkeykong91 and @PfanP. - Google Meet/Voice Call: make Twilio setup preflight honor explicit `--transport twilio` and fail local/private Voice Call webhook URLs before joins. Thanks @donkeykong91 and @PfanP. 
- Voice Call/Twilio: retry transient 21220 live-call TwiML updates and catch answered-path initial-greeting failures, so a fast answered callback no longer crashes the Gateway or drops the Twilio greeting/listen transition. (#74606) Thanks @Sivan22. - Voice Call/Twilio: register accepted media streams immediately but wait for realtime transcription readiness before speaking the initial greeting, so reconnect grace handling stays live while OpenAI STT startup is no longer starved by TTS. Fixes #75197. (#75257) Thanks @donkeykong91 and @PfanP. diff --git a/docs/plugins/google-meet.md b/docs/plugins/google-meet.md index aea755b023d..c5fac42a534 100644 --- a/docs/plugins/google-meet.md +++ b/docs/plugins/google-meet.md @@ -981,7 +981,9 @@ Twilio-only config: ``` `voiceCall.enabled` defaults to `true`; with Twilio transport it delegates the -actual PSTN call and DTMF to the Voice Call plugin. If `voice-call` is not +actual PSTN call, DTMF, and intro greeting to the Voice Call plugin. Voice Call +plays the DTMF sequence before opening the realtime media stream, then uses the +saved intro text as the initial realtime greeting. If `voice-call` is not enabled, Google Meet can still validate and record the dial plan, but it cannot place the Twilio call. @@ -1411,9 +1413,10 @@ participant: the PIN. - Increase the leading pauses in `--dtmf-sequence` if Meet answers slowly, for example `wwww123456#`. -- If the participant joins but you miss the first spoken line, increase - `plugins.entries.google-meet.config.voiceCall.postDtmfSpeechDelayMs` so the - intro is spoken after Meet finishes admitting the phone participant. +- If the participant joins but you do not hear the greeting, check + `openclaw voicecall tail` for a Twilio stream start followed by realtime + provider readiness. The greeting is now generated from the initial + `voicecall.start` message after the stream connects. 
If webhooks do not arrive, debug the Voice Call plugin first: the provider must reach `plugins.entries.voice-call.config.publicUrl` or the configured tunnel. diff --git a/docs/plugins/voice-call.md b/docs/plugins/voice-call.md index be59df2f39f..c91301a15f7 100644 --- a/docs/plugins/voice-call.md +++ b/docs/plugins/voice-call.md @@ -766,10 +766,10 @@ If Voice Call is green but the Meet participant never joins, check the Meet dial-in number, PIN, and `--dtmf-sequence`. The phone call can be healthy while the meeting rejects or ignores an incorrect DTMF sequence. -Google Meet starts Voice Call silently, sends DTMF, then asks Voice Call to -speak the intro after `voiceCall.postDtmfSpeechDelayMs`. Increase that delay in -the Google Meet plugin config if the first line is spoken before Meet admits the -phone participant. +Google Meet passes the Meet DTMF sequence and intro text to `voicecall.start`. +For Twilio calls, Voice Call serves the DTMF TwiML first, redirects back to the +webhook, then opens the realtime media stream so the saved intro is generated +after the phone participant has joined the meeting. ### Realtime call has no speech diff --git a/extensions/google-meet/index.ts b/extensions/google-meet/index.ts index 5f58c2f82b9..717c320b99e 100644 --- a/extensions/google-meet/index.ts +++ b/extensions/google-meet/index.ts @@ -118,9 +118,14 @@ const googleMeetConfigSchema = { label: "Voice Call Request Timeout (ms)", advanced: true, }, - "voiceCall.dtmfDelayMs": { label: "DTMF Delay (ms)", advanced: true }, + "voiceCall.dtmfDelayMs": { + label: "Legacy DTMF Delay (ms)", + help: "Compatibility setting from the old post-connect DTMF flow. Twilio Meet joins now play DTMF before realtime connect.", + advanced: true, + }, "voiceCall.postDtmfSpeechDelayMs": { - label: "Post-DTMF Speech Delay (ms)", + label: "Legacy Post-DTMF Speech Delay (ms)", + help: "Compatibility setting from the old delayed-speech flow. 
Twilio Meet joins now carry the intro as the initial Voice Call message.", advanced: true, }, "voiceCall.introMessage": { label: "Voice Call Intro Message", advanced: true }, diff --git a/extensions/google-meet/openclaw.plugin.json b/extensions/google-meet/openclaw.plugin.json index 5166ee3f912..bf048fb811f 100644 --- a/extensions/google-meet/openclaw.plugin.json +++ b/extensions/google-meet/openclaw.plugin.json @@ -112,7 +112,8 @@ "advanced": true }, "voiceCall.dtmfDelayMs": { - "label": "DTMF Delay (ms)", + "label": "Legacy DTMF Delay (ms)", + "help": "Compatibility setting from the old post-connect DTMF flow. Twilio Meet joins now play DTMF before realtime connect.", "advanced": true }, "voiceCall.introMessage": { diff --git a/extensions/google-meet/src/voice-call-gateway.test.ts b/extensions/google-meet/src/voice-call-gateway.test.ts index 6aadaf4e500..6f248fe4efc 100644 --- a/extensions/google-meet/src/voice-call-gateway.test.ts +++ b/extensions/google-meet/src/voice-call-gateway.test.ts @@ -27,12 +27,11 @@ describe("Google Meet voice-call gateway", () => { gatewayMocks.startGatewayClientWhenEventLoopReady.mockClear(); }); - it("starts Twilio Meet calls silently, sends DTMF, then speaks the realtime intro", async () => { + it("starts Twilio Meet calls with pre-connect DTMF and intro metadata", async () => { const config = resolveGoogleMeetConfig({ voiceCall: { gatewayUrl: "ws://127.0.0.1:18789", dtmfDelayMs: 1, - postDtmfSpeechDelayMs: 1, }, realtime: { introMessage: "Say exactly: I'm here and listening." 
}, }); @@ -50,26 +49,11 @@ describe("Google Meet voice-call gateway", () => { { to: "+15551234567", mode: "conversation", - }, - { timeoutMs: 30_000 }, - ); - expect(gatewayMocks.request).toHaveBeenNthCalledWith( - 2, - "voicecall.dtmf", - { - callId: "call-1", - digits: "123456#", - }, - { timeoutMs: 30_000 }, - ); - expect(gatewayMocks.request).toHaveBeenNthCalledWith( - 3, - "voicecall.speak", - { - callId: "call-1", message: "Say exactly: I'm here and listening.", + dtmfSequence: "123456#", }, { timeoutMs: 30_000 }, ); + expect(gatewayMocks.request).toHaveBeenCalledTimes(1); }); }); diff --git a/extensions/google-meet/src/voice-call-gateway.ts b/extensions/google-meet/src/voice-call-gateway.ts index 84224d663b6..fa7f955bb44 100644 --- a/extensions/google-meet/src/voice-call-gateway.ts +++ b/extensions/google-meet/src/voice-call-gateway.ts @@ -1,4 +1,3 @@ -import { setTimeout as sleep } from "node:timers/promises"; import { GatewayClient, startGatewayClientWhenEventLoopReady, @@ -84,37 +83,14 @@ export async function joinMeetViaVoiceCallGateway(params: { { to: params.dialInNumber, mode: "conversation", + ...(params.message ? { message: params.message } : {}), + ...(params.dtmfSequence ? 
{ dtmfSequence: params.dtmfSequence } : {}), }, { timeoutMs: params.config.voiceCall.requestTimeoutMs }, )) as VoiceCallStartResult; if (!start.callId) { throw new Error(start.error || "voicecall.start did not return callId"); } - if (params.dtmfSequence) { - await sleep(params.config.voiceCall.dtmfDelayMs); - await client.request( - "voicecall.dtmf", - { - callId: start.callId, - digits: params.dtmfSequence, - }, - { timeoutMs: params.config.voiceCall.requestTimeoutMs }, - ); - } - if (params.message) { - await sleep(params.config.voiceCall.postDtmfSpeechDelayMs); - const spoken = (await client.request( - "voicecall.speak", - { - callId: start.callId, - message: params.message, - }, - { timeoutMs: params.config.voiceCall.requestTimeoutMs }, - )) as VoiceCallSpeakResult; - if (spoken.success === false) { - throw new Error(spoken.error || "voicecall.speak failed"); - } - } return { callId: start.callId, dtmfSent: Boolean(params.dtmfSequence), diff --git a/extensions/voice-call/index.test.ts b/extensions/voice-call/index.test.ts index 99a5ce2f001..08c3812a122 100644 --- a/extensions/voice-call/index.test.ts +++ b/extensions/voice-call/index.test.ts @@ -325,10 +325,16 @@ describe("voice-call plugin", () => { | undefined; const respond = vi.fn(); await handler?.({ - params: { message: "Hi", mode: "conversation", to: "+15550001234" }, + params: { + dtmfSequence: "ww123456#", + message: "Hi", + mode: "conversation", + to: "+15550001234", + }, respond, }); expect(runtimeStub.manager.initiateCall).toHaveBeenCalledWith("+15550001234", undefined, { + dtmfSequence: "ww123456#", message: "Hi", mode: "conversation", }); diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts index d60d8576f00..c094b843c7a 100644 --- a/extensions/voice-call/index.ts +++ b/extensions/voice-call/index.ts @@ -121,6 +121,7 @@ const VoiceCallToolSchema = Type.Union([ to: Type.Optional(Type.String({ description: "Call target" })), message: Type.String({ description: "Intro 
message" }), mode: Type.Optional(Type.Union([Type.Literal("notify"), Type.Literal("conversation")])), + dtmfSequence: Type.Optional(Type.String({ description: "DTMF digits to play before connect" })), }), Type.Object({ action: Type.Literal("continue_call"), @@ -150,6 +151,7 @@ const VoiceCallToolSchema = Type.Union([ to: Type.Optional(Type.String({ description: "Call target" })), sid: Type.Optional(Type.String({ description: "Call SID" })), message: Type.Optional(Type.String({ description: "Optional intro message" })), + dtmfSequence: Type.Optional(Type.String({ description: "DTMF digits to play before connect" })), }), ]); @@ -275,10 +277,12 @@ export default definePluginEntry({ to: string; message?: string; mode?: "notify" | "conversation"; + dtmfSequence?: string; }) => { const result = await params.rt.manager.initiateCall(params.to, undefined, { message: params.message, mode: params.mode, + dtmfSequence: params.dtmfSequence, }); if (!result.success) { params.respond(false, { error: result.error || "initiate failed" }); @@ -470,6 +474,7 @@ export default definePluginEntry({ try { const to = normalizeOptionalString(params?.to) ?? ""; const message = normalizeOptionalString(params?.message) ?? ""; + const dtmfSequence = normalizeOptionalString(params?.dtmfSequence); if (!to) { respond(false, { error: "to required" }); return; @@ -483,6 +488,7 @@ export default definePluginEntry({ to, message: message || undefined, mode, + dtmfSequence, }); } catch (err) { sendError(respond, err); @@ -518,6 +524,7 @@ export default definePluginEntry({ } const result = await rt.manager.initiateCall(to, undefined, { message, + dtmfSequence: normalizeOptionalString(rawParams.dtmfSequence), mode: rawParams.mode === "notify" || rawParams.mode === "conversation" ? 
rawParams.mode @@ -602,6 +609,7 @@ export default definePluginEntry({ throw new Error("to required for call"); } const result = await rt.manager.initiateCall(to, undefined, { + dtmfSequence: normalizeOptionalString(rawParams.dtmfSequence), message: normalizeOptionalString(rawParams.message), }); if (!result.success) { diff --git a/extensions/voice-call/src/manager/outbound.test.ts b/extensions/voice-call/src/manager/outbound.test.ts index bd32794ff8e..6555ea9f647 100644 --- a/extensions/voice-call/src/manager/outbound.test.ts +++ b/extensions/voice-call/src/manager/outbound.test.ts @@ -3,6 +3,7 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; const { addTranscriptEntryMock, clearMaxDurationTimerMock, + generateDtmfRedirectTwimlMock, generateNotifyTwimlMock, getCallByProviderCallIdMock, mapVoiceToPollyMock, @@ -12,6 +13,7 @@ const { } = vi.hoisted(() => ({ addTranscriptEntryMock: vi.fn(), clearMaxDurationTimerMock: vi.fn(), + generateDtmfRedirectTwimlMock: vi.fn(), generateNotifyTwimlMock: vi.fn(), getCallByProviderCallIdMock: vi.fn(), mapVoiceToPollyMock: vi.fn(), @@ -45,6 +47,7 @@ vi.mock("../voice-mapping.js", () => ({ })); vi.mock("./twiml.js", () => ({ + generateDtmfRedirectTwiml: generateDtmfRedirectTwimlMock, generateNotifyTwiml: generateNotifyTwimlMock, })); @@ -69,6 +72,7 @@ describe("voice-call outbound helpers", () => { beforeEach(() => { vi.clearAllMocks(); mapVoiceToPollyMock.mockReturnValue("Polly.Joanna"); + generateDtmfRedirectTwimlMock.mockReturnValue(""); generateNotifyTwimlMock.mockReturnValue(""); }); @@ -169,6 +173,51 @@ describe("voice-call outbound helpers", () => { expect(persistCallRecordMock).toHaveBeenCalledTimes(2); }); + it("initiates conversation calls with pre-connect DTMF TwiML", async () => { + const initiateProviderCall = vi.fn(async () => ({ providerCallId: "provider-1" })); + const ctx = { + activeCalls: new Map(), + providerCallIdMap: new Map(), + provider: { name: "twilio", initiateCall: initiateProviderCall }, 
+ config: { + maxConcurrentCalls: 3, + outbound: { defaultMode: "conversation" }, + fromNumber: "+14155550100", + }, + storePath: "/tmp/voice-call.json", + webhookUrl: "https://example.com/webhook", + }; + + const result = await initiateCall(ctx as never, "+14155550123", "session-1", { + mode: "conversation", + message: "hello meet", + dtmfSequence: "ww123456#", + }); + + expect(result).toEqual({ + callId: expect.any(String), + success: true, + }); + const callId = result.callId; + + expect(generateDtmfRedirectTwimlMock).toHaveBeenCalledWith( + "ww123456#", + "https://example.com/webhook", + ); + expect(initiateProviderCall).toHaveBeenCalledWith({ + callId, + from: "+14155550100", + to: "+14155550123", + webhookUrl: "https://example.com/webhook", + inlineTwiml: undefined, + preConnectTwiml: "", + }); + expect(ctx.activeCalls.get(callId)?.metadata).toMatchObject({ + initialMessage: "hello meet", + mode: "conversation", + }); + }); + it("fails initiateCall cleanly when provider initiation throws", async () => { const ctx = { activeCalls: new Map(), diff --git a/extensions/voice-call/src/manager/outbound.ts b/extensions/voice-call/src/manager/outbound.ts index c1678498a35..61841f28d1a 100644 --- a/extensions/voice-call/src/manager/outbound.ts +++ b/extensions/voice-call/src/manager/outbound.ts @@ -16,7 +16,7 @@ import { getCallByProviderCallId } from "./lookup.js"; import { addTranscriptEntry, transitionState } from "./state.js"; import { persistCallRecord } from "./store.js"; import { clearTranscriptWaiter, waitForFinalTranscript } from "./timers.js"; -import { generateNotifyTwiml } from "./twiml.js"; +import { generateDtmfRedirectTwiml, generateNotifyTwiml } from "./twiml.js"; type InitiateContext = Pick< CallManagerContext, @@ -118,6 +118,13 @@ export async function initiateCall( typeof options === "string" ? { message: options } : (options ?? {}); const initialMessage = opts.message; const mode = opts.mode ?? 
ctx.config.outbound.defaultMode; + const dtmfSequence = opts.dtmfSequence; + if (dtmfSequence) { + const validationError = validateDtmfDigits(dtmfSequence); + if (validationError) { + return { callId: "", success: false, error: validationError }; + } + } if (!ctx.provider) { return { callId: "", success: false, error: "Provider not initialized" }; @@ -164,10 +171,13 @@ export async function initiateCall( try { // For notify mode with a message, use inline TwiML with <Say>. let inlineTwiml: string | undefined; + let preConnectTwiml: string | undefined; if (mode === "notify" && initialMessage) { const pollyVoice = mapVoiceToPolly(resolvePreferredTtsVoice(ctx.config)); inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice); console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`); + } else if (dtmfSequence) { + preConnectTwiml = generateDtmfRedirectTwiml(dtmfSequence, ctx.webhookUrl); } const result = await ctx.provider.initiateCall({ @@ -176,6 +186,7 @@ export async function initiateCall( to, webhookUrl: ctx.webhookUrl, inlineTwiml, + preConnectTwiml, }); callRecord.providerCallId = result.providerCallId; diff --git a/extensions/voice-call/src/manager/twiml.ts b/extensions/voice-call/src/manager/twiml.ts index 588df559057..1e20d652b92 100644 --- a/extensions/voice-call/src/manager/twiml.ts +++ b/extensions/voice-call/src/manager/twiml.ts @@ -7,3 +7,11 @@ export function generateNotifyTwiml(message: string, voice: string): string { `; } + +export function generateDtmfRedirectTwiml(digits: string, webhookUrl: string): string { + return ` + + + ${escapeXml(webhookUrl)} +`; +} diff --git a/extensions/voice-call/src/providers/twilio.test.ts b/extensions/voice-call/src/providers/twilio.test.ts index 535f0921eaa..a024ba532ef 100644 --- a/extensions/voice-call/src/providers/twilio.test.ts +++ b/extensions/voice-call/src/providers/twilio.test.ts @@ -99,6 +99,41 @@ describe("TwilioProvider", () => { 
expectStreamingTwiml(requireResponseBody(result.providerResponseBody)); }); + it("serves pre-connect TwiML once before outbound streaming starts", async () => { + const provider = createProvider(); + ( + provider as unknown as { + apiRequest: TwilioApiRequest; + } + ).apiRequest = vi.fn(async () => ({ + sid: "CA999", + status: "queued", + })); + const preConnectTwiml = ''; + + await provider.initiateCall({ + callId: "call-1", + from: "+15550000001", + to: "+15550000002", + webhookUrl: "https://example.ngrok.app/voice/twilio", + preConnectTwiml, + }); + + const first = provider.parseWebhookEvent( + createContext("CallStatus=initiated&Direction=outbound-api&CallSid=CA999", { + callId: "call-1", + }), + ); + expect(requireResponseBody(first.providerResponseBody)).toBe(preConnectTwiml); + + const second = provider.parseWebhookEvent( + createContext("CallStatus=initiated&Direction=outbound-api&CallSid=CA999", { + callId: "call-1", + }), + ); + expectStreamingTwiml(requireResponseBody(second.providerResponseBody)); + }); + it("returns empty TwiML for status callbacks", () => { const provider = createProvider(); const ctx = createContext("CallStatus=ringing&Direction=outbound-api", { diff --git a/extensions/voice-call/src/providers/twilio.ts b/extensions/voice-call/src/providers/twilio.ts index cde782fd802..3bbdee45a04 100644 --- a/extensions/voice-call/src/providers/twilio.ts +++ b/extensions/voice-call/src/providers/twilio.ts @@ -516,8 +516,8 @@ export class TwilioProvider implements VoiceCallProvider { /** * Initiate an outbound call via Twilio API. - * If inlineTwiml is provided, uses that directly (for notify mode). - * Otherwise, uses webhook URL for dynamic TwiML. + * If inlineTwiml or preConnectTwiml is provided, the first webhook request + * receives that TwiML before normal dynamic TwiML resumes. 
*/ async initiateCall(input: InitiateCallInput): Promise { const url = new URL(input.webhookUrl); @@ -533,6 +533,8 @@ export class TwilioProvider implements VoiceCallProvider { if (input.inlineTwiml) { this.twimlStorage.set(input.callId, input.inlineTwiml); this.notifyCalls.add(input.callId); + } else if (input.preConnectTwiml) { + this.twimlStorage.set(input.callId, input.preConnectTwiml); } // Build request params - always use URL-based TwiML. diff --git a/extensions/voice-call/src/types.ts b/extensions/voice-call/src/types.ts index 25549919cae..94604a5cf02 100644 --- a/extensions/voice-call/src/types.ts +++ b/extensions/voice-call/src/types.ts @@ -214,6 +214,8 @@ export type InitiateCallInput = { clientState?: Record; /** Inline TwiML to execute (skips webhook, used for notify mode) */ inlineTwiml?: string; + /** TwiML to serve once before normal webhook-driven call handling resumes. */ + preConnectTwiml?: string; }; export type InitiateCallResult = {