diff --git a/CHANGELOG.md b/CHANGELOG.md index 38c73301246..87297cacc40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Voice Call/Twilio: honor stored pre-connect TwiML before realtime webhook shortcuts and reject DTMF sequences outside conversation mode, so Meet PIN entry cannot be skipped or silently dropped. Thanks @donkeykong91 and @PfanP. - Google Meet/Voice Call: play Twilio Meet DTMF before opening the realtime media stream and carry the intro as the initial Voice Call message, so the greeting is generated after Meet admits the phone participant instead of racing a live-call TwiML update. Thanks @donkeykong91 and @PfanP. - Google Meet/Voice Call: make Twilio setup preflight honor explicit `--transport twilio` and fail local/private Voice Call webhook URLs before joins. Thanks @donkeykong91 and @PfanP. - Voice Call/Twilio: retry transient 21220 live-call TwiML updates and catch answered-path initial-greeting failures, so a fast answered callback no longer crashes the Gateway or drops the Twilio greeting/listen transition. (#74606) Thanks @Sivan22. diff --git a/docs/plugins/voice-call.md b/docs/plugins/voice-call.md index c91301a15f7..20067de6c05 100644 --- a/docs/plugins/voice-call.md +++ b/docs/plugins/voice-call.md @@ -624,27 +624,31 @@ for turn latency and listen-wait times. Tool name: `voice_call`. -| Action | Args | -| --------------- | ------------------------- | -| `initiate_call` | `message`, `to?`, `mode?` | -| `continue_call` | `callId`, `message` | -| `speak_to_user` | `callId`, `message` | -| `send_dtmf` | `callId`, `digits` | -| `end_call` | `callId` | -| `get_status` | `callId` | +| Action | Args | +| --------------- | ------------------------------------------ | +| `initiate_call` | `message`, `to?`, `mode?`, `dtmfSequence?` | +| `continue_call` | `callId`, `message` | +| `speak_to_user` | `callId`, `message` | +| `send_dtmf` | `callId`, `digits` | +| `end_call` | `callId` | +| `get_status` | `callId` | This repo ships a matching skill doc at `skills/voice-call/SKILL.md`. ## Gateway RPC -| Method | Args | -| -------------------- | ------------------------- | -| `voicecall.initiate` | `to?`, `message`, `mode?` | -| `voicecall.continue` | `callId`, `message` | -| `voicecall.speak` | `callId`, `message` | -| `voicecall.dtmf` | `callId`, `digits` | -| `voicecall.end` | `callId` | -| `voicecall.status` | `callId` | +| Method | Args | +| -------------------- | ------------------------------------------ | +| `voicecall.initiate` | `to?`, `message`, `mode?`, `dtmfSequence?` | +| `voicecall.continue` | `callId`, `message` | +| `voicecall.speak` | `callId`, `message` | +| `voicecall.dtmf` | `callId`, `digits` | +| `voicecall.end` | `callId` | +| `voicecall.status` | `callId` | + +`dtmfSequence` is only valid with `mode: "conversation"`. Notify-mode calls +should use `voicecall.dtmf` after the call exists if they need post-connect +digits. ## Troubleshooting diff --git a/extensions/voice-call/src/manager/outbound.test.ts b/extensions/voice-call/src/manager/outbound.test.ts index 6555ea9f647..dd417ed98e9 100644 --- a/extensions/voice-call/src/manager/outbound.test.ts +++ b/extensions/voice-call/src/manager/outbound.test.ts @@ -218,6 +218,36 @@ describe("voice-call outbound helpers", () => { }); }); + it("rejects DTMF sequences outside conversation mode", async () => { + const initiateProviderCall = vi.fn(async () => ({ providerCallId: "provider-1" })); + const ctx = { + activeCalls: new Map(), + providerCallIdMap: new Map(), + provider: { name: "twilio", initiateCall: initiateProviderCall }, + config: { + maxConcurrentCalls: 3, + outbound: { defaultMode: "notify" }, + fromNumber: "+14155550100", + }, + storePath: "/tmp/voice-call.json", + webhookUrl: "https://example.com/webhook", + }; + + await expect( + initiateCall(ctx as never, "+14155550123", "session-1", { + message: "hello", + dtmfSequence: "123456#", + }), + ).resolves.toEqual({ + callId: "", + success: false, + error: "dtmfSequence requires conversation mode", + }); + + expect(initiateProviderCall).not.toHaveBeenCalled(); + expect(ctx.activeCalls.size).toBe(0); + }); + it("fails initiateCall cleanly when provider initiation throws", async () => { const ctx = { activeCalls: new Map(), diff --git a/extensions/voice-call/src/manager/outbound.ts b/extensions/voice-call/src/manager/outbound.ts index 61841f28d1a..79835a2428a 100644 --- a/extensions/voice-call/src/manager/outbound.ts +++ b/extensions/voice-call/src/manager/outbound.ts @@ -124,6 +124,13 @@ export async function initiateCall( if (validationError) { return { callId: "", success: false, error: validationError }; } + if (mode !== "conversation") { + return { + callId: "", + success: false, + error: "dtmfSequence requires conversation mode", + }; + } } if (!ctx.provider) { diff --git a/extensions/voice-call/src/providers/base.ts b/extensions/voice-call/src/providers/base.ts index 8319cc8e3fc..fa7acac4bde 100644 --- a/extensions/voice-call/src/providers/base.ts +++ b/extensions/voice-call/src/providers/base.ts @@ -43,6 +43,12 @@ export interface VoiceCallProvider { */ parseWebhookEvent(ctx: WebhookContext, options?: WebhookParseOptions): ProviderWebhookParseResult; + /** + * Consume one-time TwiML that must be served before shortcut handlers such as + * realtime media streams take over the webhook response. + */ + consumeInitialTwiML?: (ctx: WebhookContext) => string | null; + /** * Initiate an outbound call. * @returns Provider call ID and status diff --git a/extensions/voice-call/src/providers/twilio.ts b/extensions/voice-call/src/providers/twilio.ts index 3bbdee45a04..8c5c9460988 100644 --- a/extensions/voice-call/src/providers/twilio.ts +++ b/extensions/voice-call/src/providers/twilio.ts @@ -443,6 +443,19 @@ export class TwilioProvider implements VoiceCallProvider { } } + consumeInitialTwiML(ctx: WebhookContext): string | null { + const view = readTwimlRequestView(ctx); + if (!view.callIdFromQuery || view.isStatusCallback) { + return null; + } + const storedTwiml = this.twimlStorage.get(view.callIdFromQuery); + if (!storedTwiml) { + return null; + } + this.deleteStoredTwiml(view.callIdFromQuery); + return storedTwiml; + } + /** * Get the WebSocket URL for media streaming. * Derives from the public URL origin + stream path. diff --git a/extensions/voice-call/src/webhook.test.ts b/extensions/voice-call/src/webhook.test.ts index 67c50c0cf19..1e321d32844 100644 --- a/extensions/voice-call/src/webhook.test.ts +++ b/extensions/voice-call/src/webhook.test.ts @@ -679,6 +679,71 @@ describe("VoiceCallWebhookServer replay handling", () => { }, ); + it("serves initial provider TwiML before the realtime shortcut", async () => { + const parseWebhookEvent = vi.fn(() => ({ events: [], statusCode: 200 })); + const consumeInitialTwiML = vi.fn( + () => + 'https://example.test', + ); + const buildTwiMLPayload = vi.fn(() => ({ + statusCode: 200, + headers: { "Content-Type": "text/xml" }, + body: '', + })); + const twilioProvider: VoiceCallProvider = { + ...provider, + name: "twilio", + verifyWebhook: () => ({ ok: true, verifiedRequestKey: "twilio:req:rt-stored" }), + parseWebhookEvent, + consumeInitialTwiML, + }; + const { manager, processEvent } = createManager([]); + const config = createConfig({ + provider: "twilio", + inboundPolicy: "disabled", + realtime: { + enabled: true, + streamPath: "/voice/stream/realtime", + instructions: "Be helpful.", + toolPolicy: "safe-read-only", + tools: [], + providers: {}, + }, + }); + const server = new VoiceCallWebhookServer(config, manager, twilioProvider); + server.setRealtimeHandler({ + buildTwiMLPayload, + getStreamPathPattern: () => "/voice/stream/realtime", + handleWebSocketUpgrade: () => {}, + registerToolHandler: () => {}, + setPublicUrl: () => {}, + } as unknown as RealtimeCallHandler); + + try { + const baseUrl = await server.start(); + const requestUrl = requireBoundRequestUrl(server, baseUrl); + requestUrl.searchParams.set("callId", "call-1"); + const response = await fetch(requestUrl.toString(), { + method: "POST", + headers: { + "content-type": "application/x-www-form-urlencoded", + "x-twilio-signature": "sig", + }, + body: "CallSid=CA123&Direction=outbound-api&CallStatus=in-progress&From=%2B15550001111&To=%2B15550002222", + }); + + expect(response.status).toBe(200); + const body = await response.text(); + expect(body).toContain(' { const buildTwiMLPayload = vi.fn(() => ({ statusCode: 200, diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index f7e40987bb8..b9c97069d3d 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -672,6 +672,15 @@ export class VoiceCallWebhookServer { return { statusCode: 401, body: "Unauthorized" }; } + const initialTwiML = this.provider.consumeInitialTwiML?.(ctx); + if (initialTwiML !== undefined && initialTwiML !== null) { + return { + statusCode: 200, + headers: { "Content-Type": "application/xml" }, + body: initialTwiML, + }; + } + const realtimeParams = this.getRealtimeTwimlParams(ctx); if (realtimeParams) { const direction = realtimeParams.get("Direction");