fix: sequence meet dtmf before realtime bridge

This commit is contained in:
Peter Steinberger
2026-05-01 07:04:53 +01:00
parent 42d73fd955
commit ae07d57f9d
15 changed files with 151 additions and 61 deletions

View File

@@ -13,7 +13,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Google Meet/Voice Call: defer Twilio dial-in intro speech until after Meet DTMF entry and route delayed speech through the active realtime Voice Call bridge. Thanks @donkeykong91 and @PfanP.
- Google Meet/Voice Call: play Twilio Meet DTMF before opening the realtime media stream and carry the intro as the initial Voice Call message, so the greeting is generated after Meet admits the phone participant instead of racing a live-call TwiML update. Thanks @donkeykong91 and @PfanP.
- Google Meet/Voice Call: make Twilio setup preflight honor explicit `--transport twilio` and fail local/private Voice Call webhook URLs before joins. Thanks @donkeykong91 and @PfanP.
- Voice Call/Twilio: retry transient 21220 live-call TwiML updates and catch answered-path initial-greeting failures, so a fast answered callback no longer crashes the Gateway or drops the Twilio greeting/listen transition. (#74606) Thanks @Sivan22.
- Voice Call/Twilio: register accepted media streams immediately but wait for realtime transcription readiness before speaking the initial greeting, so reconnect grace handling stays live while OpenAI STT startup is no longer starved by TTS. Fixes #75197. (#75257) Thanks @donkeykong91 and @PfanP.

View File

@@ -981,7 +981,9 @@ Twilio-only config:
```
`voiceCall.enabled` defaults to `true`; with Twilio transport it delegates the
actual PSTN call and DTMF to the Voice Call plugin. If `voice-call` is not
actual PSTN call, DTMF, and intro greeting to the Voice Call plugin. Voice Call
plays the DTMF sequence before opening the realtime media stream, then uses the
saved intro text as the initial realtime greeting. If `voice-call` is not
enabled, Google Meet can still validate and record the dial plan, but it cannot
place the Twilio call.
@@ -1411,9 +1413,10 @@ participant:
the PIN.
- Increase the leading pauses in `--dtmf-sequence` if Meet answers slowly, for
example `wwww123456#`.
- If the participant joins but you miss the first spoken line, increase
`plugins.entries.google-meet.config.voiceCall.postDtmfSpeechDelayMs` so the
intro is spoken after Meet finishes admitting the phone participant.
- If the participant joins but you do not hear the greeting, check
`openclaw voicecall tail` for a Twilio stream start followed by realtime
provider readiness. The greeting is now generated from the initial
`voicecall.start` message after the stream connects.
If webhooks do not arrive, debug the Voice Call plugin first: the provider must
reach `plugins.entries.voice-call.config.publicUrl` or the configured tunnel.

View File

@@ -766,10 +766,10 @@ If Voice Call is green but the Meet participant never joins, check the Meet
dial-in number, PIN, and `--dtmf-sequence`. The phone call can be healthy while
the meeting rejects or ignores an incorrect DTMF sequence.
Google Meet starts Voice Call silently, sends DTMF, then asks Voice Call to
speak the intro after `voiceCall.postDtmfSpeechDelayMs`. Increase that delay in
the Google Meet plugin config if the first line is spoken before Meet admits the
phone participant.
Google Meet passes the Meet DTMF sequence and intro text to `voicecall.start`.
For Twilio calls, Voice Call serves the DTMF TwiML first, redirects back to the
webhook, then opens the realtime media stream so the saved intro is generated
after the phone participant has joined the meeting.
### Realtime call has no speech

View File

@@ -118,9 +118,14 @@ const googleMeetConfigSchema = {
label: "Voice Call Request Timeout (ms)",
advanced: true,
},
"voiceCall.dtmfDelayMs": { label: "DTMF Delay (ms)", advanced: true },
"voiceCall.dtmfDelayMs": {
label: "Legacy DTMF Delay (ms)",
help: "Compatibility setting from the old post-connect DTMF flow. Twilio Meet joins now play DTMF before realtime connect.",
advanced: true,
},
"voiceCall.postDtmfSpeechDelayMs": {
label: "Post-DTMF Speech Delay (ms)",
label: "Legacy Post-DTMF Speech Delay (ms)",
help: "Compatibility setting from the old delayed-speech flow. Twilio Meet joins now carry the intro as the initial Voice Call message.",
advanced: true,
},
"voiceCall.introMessage": { label: "Voice Call Intro Message", advanced: true },

View File

@@ -112,7 +112,8 @@
"advanced": true
},
"voiceCall.dtmfDelayMs": {
"label": "DTMF Delay (ms)",
"label": "Legacy DTMF Delay (ms)",
"help": "Compatibility setting from the old post-connect DTMF flow. Twilio Meet joins now play DTMF before realtime connect.",
"advanced": true
},
"voiceCall.introMessage": {

View File

@@ -27,12 +27,11 @@ describe("Google Meet voice-call gateway", () => {
gatewayMocks.startGatewayClientWhenEventLoopReady.mockClear();
});
it("starts Twilio Meet calls silently, sends DTMF, then speaks the realtime intro", async () => {
it("starts Twilio Meet calls with pre-connect DTMF and intro metadata", async () => {
const config = resolveGoogleMeetConfig({
voiceCall: {
gatewayUrl: "ws://127.0.0.1:18789",
dtmfDelayMs: 1,
postDtmfSpeechDelayMs: 1,
},
realtime: { introMessage: "Say exactly: I'm here and listening." },
});
@@ -50,26 +49,11 @@ describe("Google Meet voice-call gateway", () => {
{
to: "+15551234567",
mode: "conversation",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
2,
"voicecall.dtmf",
{
callId: "call-1",
digits: "123456#",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
3,
"voicecall.speak",
{
callId: "call-1",
message: "Say exactly: I'm here and listening.",
dtmfSequence: "123456#",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenCalledTimes(1);
});
});

View File

@@ -1,4 +1,3 @@
import { setTimeout as sleep } from "node:timers/promises";
import {
GatewayClient,
startGatewayClientWhenEventLoopReady,
@@ -84,37 +83,14 @@ export async function joinMeetViaVoiceCallGateway(params: {
{
to: params.dialInNumber,
mode: "conversation",
...(params.message ? { message: params.message } : {}),
...(params.dtmfSequence ? { dtmfSequence: params.dtmfSequence } : {}),
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
)) as VoiceCallStartResult;
if (!start.callId) {
throw new Error(start.error || "voicecall.start did not return callId");
}
if (params.dtmfSequence) {
await sleep(params.config.voiceCall.dtmfDelayMs);
await client.request(
"voicecall.dtmf",
{
callId: start.callId,
digits: params.dtmfSequence,
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
);
}
if (params.message) {
await sleep(params.config.voiceCall.postDtmfSpeechDelayMs);
const spoken = (await client.request(
"voicecall.speak",
{
callId: start.callId,
message: params.message,
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
)) as VoiceCallSpeakResult;
if (spoken.success === false) {
throw new Error(spoken.error || "voicecall.speak failed");
}
}
return {
callId: start.callId,
dtmfSent: Boolean(params.dtmfSequence),

View File

@@ -325,10 +325,16 @@ describe("voice-call plugin", () => {
| undefined;
const respond = vi.fn();
await handler?.({
params: { message: "Hi", mode: "conversation", to: "+15550001234" },
params: {
dtmfSequence: "ww123456#",
message: "Hi",
mode: "conversation",
to: "+15550001234",
},
respond,
});
expect(runtimeStub.manager.initiateCall).toHaveBeenCalledWith("+15550001234", undefined, {
dtmfSequence: "ww123456#",
message: "Hi",
mode: "conversation",
});

View File

@@ -121,6 +121,7 @@ const VoiceCallToolSchema = Type.Union([
to: Type.Optional(Type.String({ description: "Call target" })),
message: Type.String({ description: "Intro message" }),
mode: Type.Optional(Type.Union([Type.Literal("notify"), Type.Literal("conversation")])),
dtmfSequence: Type.Optional(Type.String({ description: "DTMF digits to play before connect" })),
}),
Type.Object({
action: Type.Literal("continue_call"),
@@ -150,6 +151,7 @@ const VoiceCallToolSchema = Type.Union([
to: Type.Optional(Type.String({ description: "Call target" })),
sid: Type.Optional(Type.String({ description: "Call SID" })),
message: Type.Optional(Type.String({ description: "Optional intro message" })),
dtmfSequence: Type.Optional(Type.String({ description: "DTMF digits to play before connect" })),
}),
]);
@@ -275,10 +277,12 @@ export default definePluginEntry({
to: string;
message?: string;
mode?: "notify" | "conversation";
dtmfSequence?: string;
}) => {
const result = await params.rt.manager.initiateCall(params.to, undefined, {
message: params.message,
mode: params.mode,
dtmfSequence: params.dtmfSequence,
});
if (!result.success) {
params.respond(false, { error: result.error || "initiate failed" });
@@ -470,6 +474,7 @@ export default definePluginEntry({
try {
const to = normalizeOptionalString(params?.to) ?? "";
const message = normalizeOptionalString(params?.message) ?? "";
const dtmfSequence = normalizeOptionalString(params?.dtmfSequence);
if (!to) {
respond(false, { error: "to required" });
return;
@@ -483,6 +488,7 @@ export default definePluginEntry({
to,
message: message || undefined,
mode,
dtmfSequence,
});
} catch (err) {
sendError(respond, err);
@@ -518,6 +524,7 @@ export default definePluginEntry({
}
const result = await rt.manager.initiateCall(to, undefined, {
message,
dtmfSequence: normalizeOptionalString(rawParams.dtmfSequence),
mode:
rawParams.mode === "notify" || rawParams.mode === "conversation"
? rawParams.mode
@@ -602,6 +609,7 @@ export default definePluginEntry({
throw new Error("to required for call");
}
const result = await rt.manager.initiateCall(to, undefined, {
dtmfSequence: normalizeOptionalString(rawParams.dtmfSequence),
message: normalizeOptionalString(rawParams.message),
});
if (!result.success) {

View File

@@ -3,6 +3,7 @@ import { beforeEach, describe, expect, it, vi } from "vitest";
const {
addTranscriptEntryMock,
clearMaxDurationTimerMock,
generateDtmfRedirectTwimlMock,
generateNotifyTwimlMock,
getCallByProviderCallIdMock,
mapVoiceToPollyMock,
@@ -12,6 +13,7 @@ const {
} = vi.hoisted(() => ({
addTranscriptEntryMock: vi.fn(),
clearMaxDurationTimerMock: vi.fn(),
generateDtmfRedirectTwimlMock: vi.fn(),
generateNotifyTwimlMock: vi.fn(),
getCallByProviderCallIdMock: vi.fn(),
mapVoiceToPollyMock: vi.fn(),
@@ -45,6 +47,7 @@ vi.mock("../voice-mapping.js", () => ({
}));
vi.mock("./twiml.js", () => ({
generateDtmfRedirectTwiml: generateDtmfRedirectTwimlMock,
generateNotifyTwiml: generateNotifyTwimlMock,
}));
@@ -69,6 +72,7 @@ describe("voice-call outbound helpers", () => {
beforeEach(() => {
vi.clearAllMocks();
mapVoiceToPollyMock.mockReturnValue("Polly.Joanna");
generateDtmfRedirectTwimlMock.mockReturnValue("<DtmfRedirect />");
generateNotifyTwimlMock.mockReturnValue("<Response />");
});
@@ -169,6 +173,51 @@ describe("voice-call outbound helpers", () => {
expect(persistCallRecordMock).toHaveBeenCalledTimes(2);
});
// Verifies that a conversation-mode initiateCall carrying a dtmfSequence builds
// DTMF redirect TwiML and hands it to the provider as preConnectTwiml, while the
// intro message is kept on the call record metadata.
it("initiates conversation calls with pre-connect DTMF TwiML", async () => {
// Provider stub: records the initiate request and returns a fixed provider call id.
const initiateProviderCall = vi.fn(async () => ({ providerCallId: "provider-1" }));
// Minimal hand-built context object; cast to `never` at the call site to bypass typing.
const ctx = {
activeCalls: new Map(),
providerCallIdMap: new Map(),
provider: { name: "twilio", initiateCall: initiateProviderCall },
config: {
maxConcurrentCalls: 3,
outbound: { defaultMode: "conversation" },
fromNumber: "+14155550100",
},
storePath: "/tmp/voice-call.json",
webhookUrl: "https://example.com/webhook",
};
const result = await initiateCall(ctx as never, "+14155550123", "session-1", {
mode: "conversation",
message: "hello meet",
dtmfSequence: "ww123456#",
});
// The call is accepted and a call id string is returned.
expect(result).toEqual({
callId: expect.any(String),
success: true,
});
const callId = result.callId;
// The TwiML helper receives the raw DTMF sequence and the context's webhook URL.
expect(generateDtmfRedirectTwimlMock).toHaveBeenCalledWith(
"ww123456#",
"https://example.com/webhook",
);
// "<DtmfRedirect />" is the value the mocked TwiML helper returns (set in beforeEach);
// it must arrive as preConnectTwiml, with inlineTwiml left undefined.
expect(initiateProviderCall).toHaveBeenCalledWith({
callId,
from: "+14155550100",
to: "+14155550123",
webhookUrl: "https://example.com/webhook",
inlineTwiml: undefined,
preConnectTwiml: "<DtmfRedirect />",
});
// The intro text and mode are stored on the active call's metadata.
expect(ctx.activeCalls.get(callId)?.metadata).toMatchObject({
initialMessage: "hello meet",
mode: "conversation",
});
});
it("fails initiateCall cleanly when provider initiation throws", async () => {
const ctx = {
activeCalls: new Map(),

View File

@@ -16,7 +16,7 @@ import { getCallByProviderCallId } from "./lookup.js";
import { addTranscriptEntry, transitionState } from "./state.js";
import { persistCallRecord } from "./store.js";
import { clearTranscriptWaiter, waitForFinalTranscript } from "./timers.js";
import { generateNotifyTwiml } from "./twiml.js";
import { generateDtmfRedirectTwiml, generateNotifyTwiml } from "./twiml.js";
type InitiateContext = Pick<
CallManagerContext,
@@ -118,6 +118,13 @@ export async function initiateCall(
typeof options === "string" ? { message: options } : (options ?? {});
const initialMessage = opts.message;
const mode = opts.mode ?? ctx.config.outbound.defaultMode;
const dtmfSequence = opts.dtmfSequence;
if (dtmfSequence) {
const validationError = validateDtmfDigits(dtmfSequence);
if (validationError) {
return { callId: "", success: false, error: validationError };
}
}
if (!ctx.provider) {
return { callId: "", success: false, error: "Provider not initialized" };
@@ -164,10 +171,13 @@ export async function initiateCall(
try {
// For notify mode with a message, use inline TwiML with <Say>.
let inlineTwiml: string | undefined;
let preConnectTwiml: string | undefined;
if (mode === "notify" && initialMessage) {
const pollyVoice = mapVoiceToPolly(resolvePreferredTtsVoice(ctx.config));
inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
} else if (dtmfSequence) {
preConnectTwiml = generateDtmfRedirectTwiml(dtmfSequence, ctx.webhookUrl);
}
const result = await ctx.provider.initiateCall({
@@ -176,6 +186,7 @@ export async function initiateCall(
to,
webhookUrl: ctx.webhookUrl,
inlineTwiml,
preConnectTwiml,
});
callRecord.providerCallId = result.providerCallId;

View File

@@ -7,3 +7,11 @@ export function generateNotifyTwiml(message: string, voice: string): string {
<Hangup/>
</Response>`;
}
/**
 * Build a TwiML document that plays a DTMF digit sequence and then redirects
 * the call to a webhook with a POST request.
 *
 * Both inputs are passed through `escapeXml` before being embedded in the
 * document.
 *
 * @param digits - DTMF sequence placed in the `digits` attribute of `<Play>`.
 * @param webhookUrl - URL placed in the body of `<Redirect method="POST">`.
 * @returns The TwiML document as a string.
 */
export function generateDtmfRedirectTwiml(digits: string, webhookUrl: string): string {
  const playDigits = escapeXml(digits);
  const redirectTarget = escapeXml(webhookUrl);
  const documentLines = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    "<Response>",
    `<Play digits="${playDigits}" />`,
    `<Redirect method="POST">${redirectTarget}</Redirect>`,
    "</Response>",
  ];
  return documentLines.join("\n");
}

View File

@@ -99,6 +99,41 @@ describe("TwilioProvider", () => {
expectStreamingTwiml(requireResponseBody(result.providerResponseBody));
});
// Verifies that preConnectTwiml passed to initiateCall is returned for the first
// webhook event of that call only; an identical second webhook event gets
// streaming TwiML instead.
it("serves pre-connect TwiML once before outbound streaming starts", async () => {
const provider = createProvider();
// Replace the provider's apiRequest with a stub so no real Twilio request is made.
(
provider as unknown as {
apiRequest: TwilioApiRequest;
}
).apiRequest = vi.fn<TwilioApiRequest>(async () => ({
sid: "CA999",
status: "queued",
}));
const preConnectTwiml = '<Response><Play digits="ww123456#" /></Response>';
await provider.initiateCall({
callId: "call-1",
from: "+15550000001",
to: "+15550000002",
webhookUrl: "https://example.ngrok.app/voice/twilio",
preConnectTwiml,
});
// First webhook for call-1: the response body is exactly the pre-connect TwiML.
const first = provider.parseWebhookEvent(
createContext("CallStatus=initiated&Direction=outbound-api&CallSid=CA999", {
callId: "call-1",
}),
);
expect(requireResponseBody(first.providerResponseBody)).toBe(preConnectTwiml);
// Same webhook again: the pre-connect TwiML is not served a second time.
const second = provider.parseWebhookEvent(
createContext("CallStatus=initiated&Direction=outbound-api&CallSid=CA999", {
callId: "call-1",
}),
);
expectStreamingTwiml(requireResponseBody(second.providerResponseBody));
});
it("returns empty TwiML for status callbacks", () => {
const provider = createProvider();
const ctx = createContext("CallStatus=ringing&Direction=outbound-api", {

View File

@@ -516,8 +516,8 @@ export class TwilioProvider implements VoiceCallProvider {
/**
* Initiate an outbound call via Twilio API.
* If inlineTwiml is provided, uses that directly (for notify mode).
* Otherwise, uses webhook URL for dynamic TwiML.
* If inlineTwiml or preConnectTwiml is provided, the first webhook request
* receives that TwiML before normal dynamic TwiML resumes.
*/
async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
const url = new URL(input.webhookUrl);
@@ -533,6 +533,8 @@ export class TwilioProvider implements VoiceCallProvider {
if (input.inlineTwiml) {
this.twimlStorage.set(input.callId, input.inlineTwiml);
this.notifyCalls.add(input.callId);
} else if (input.preConnectTwiml) {
this.twimlStorage.set(input.callId, input.preConnectTwiml);
}
// Build request params - always use URL-based TwiML.

View File

@@ -214,6 +214,8 @@ export type InitiateCallInput = {
clientState?: Record<string, string>;
/** Inline TwiML to execute (skips webhook, used for notify mode) */
inlineTwiml?: string;
/** TwiML to serve once before normal webhook-driven call handling resumes. */
preConnectTwiml?: string;
};
export type InitiateCallResult = {