From 8a9d02dd82d49026aae99f8b64ff75aa80932874 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 24 Apr 2026 22:35:26 +0100 Subject: [PATCH] fix(voice-call): keep outbound realtime streams attached (#71266) Fixes outbound Twilio realtime conversations so the TwiML fetch returns the realtime path for outbound directions and the answered-call path does not overwrite it with legacy TwiML. Local proof: - pnpm test extensions/voice-call/src/manager.notify.test.ts extensions/voice-call/src/webhook.test.ts - pnpm check:changed - pnpm check - pnpm build - local VoiceCallWebhookServer + CallManager smoke for Direction=outbound-api Closes #68713. --- docs/plugins/voice-call.md | 1 + .../voice-call/src/manager.notify.test.ts | 32 +++++++++++ extensions/voice-call/src/manager.ts | 3 + extensions/voice-call/src/webhook.test.ts | 55 +++++++++++++++++++ extensions/voice-call/src/webhook.ts | 9 ++- 5 files changed, 97 insertions(+), 3 deletions(-) diff --git a/docs/plugins/voice-call.md b/docs/plugins/voice-call.md index dc686b5137e..35a6b7514cd 100644 --- a/docs/plugins/voice-call.md +++ b/docs/plugins/voice-call.md @@ -548,6 +548,7 @@ For outbound `conversation` calls, first-message handling is tied to live playba - Barge-in queue clear and auto-response are suppressed only while the initial greeting is actively speaking. - If initial playback fails, the call returns to `listening` and the initial message remains queued for retry. - Initial playback for Twilio streaming starts on stream connect without extra delay. +- Realtime voice conversations use the realtime stream's own opening turn. Voice Call does not post a legacy `<Say>` TwiML update for that initial message, so outbound `<Connect><Stream>` sessions stay attached.
### Twilio stream disconnect grace diff --git a/extensions/voice-call/src/manager.notify.test.ts b/extensions/voice-call/src/manager.notify.test.ts index 5c4dbe18d92..14faacb11d4 100644 --- a/extensions/voice-call/src/manager.notify.test.ts +++ b/extensions/voice-call/src/manager.notify.test.ts @@ -177,6 +177,38 @@ describe("CallManager notify and mapping", () => { expectFirstPlayTtsText(provider, "Twilio non-stream"); }); + it("lets realtime conversations own the initial greeting instead of posting legacy TwiML", async () => { + const { manager, provider } = await createManagerHarness( + { realtime: { enabled: true, provider: "openai" } }, + new FakeProvider("twilio"), + ); + + const callId = await initiateCallWithMessage( + manager, + "+15550000010", + "Tell Nana dinner is at 6pm.", + "conversation", + ); + await answerCall(manager, callId, "evt-conversation-twilio-realtime"); + + expect(provider.playTtsCalls).toHaveLength(0); + expect(requireCall(manager, callId).metadata).toEqual( + expect.objectContaining({ initialMessage: "Tell Nana dinner is at 6pm." 
}), + ); + }); + + it("still speaks initial message in notify mode when realtime is enabled", async () => { + const { manager, provider } = await createManagerHarness( + { realtime: { enabled: true, provider: "openai" } }, + new FakeProvider("twilio"), + ); + + const callId = await initiateCallWithMessage(manager, "+15550000011", "Notify text", "notify"); + await answerCall(manager, callId, "evt-notify-twilio-realtime"); + + expectFirstPlayTtsText(provider, "Notify text"); + }); + it("waits for stream connect in conversation mode when Twilio streaming is enabled", async () => { const { manager, provider } = await createManagerHarness( { streaming: { enabled: true } }, diff --git a/extensions/voice-call/src/manager.ts b/extensions/voice-call/src/manager.ts index 4db45779c75..33ff00abdcc 100644 --- a/extensions/voice-call/src/manager.ts +++ b/extensions/voice-call/src/manager.ts @@ -307,6 +307,9 @@ export class CallManager { // is actually available; otherwise speak immediately on answered. const mode = (call.metadata?.mode as string | undefined) ?? 
"conversation"; if (mode === "conversation") { + if (this.config.realtime.enabled) { + return; + } const shouldWaitForStreamConnect = this.shouldDeferConversationInitialMessageUntilStreamConnect(); if (shouldWaitForStreamConnect) { diff --git a/extensions/voice-call/src/webhook.test.ts b/extensions/voice-call/src/webhook.test.ts index 6e95b9e6dc4..e97a63786af 100644 --- a/extensions/voice-call/src/webhook.test.ts +++ b/extensions/voice-call/src/webhook.test.ts @@ -606,6 +606,61 @@ describe("VoiceCallWebhookServer replay handling", () => { } }); + it.each(["outbound-api", "outbound-dial"] as const)( + "returns realtime TwiML for %s twilio TwiML fetches", + async (direction) => { + const parseWebhookEvent = vi.fn(() => ({ events: [], statusCode: 200 })); + const buildTwiMLPayload = vi.fn(() => ({ + statusCode: 200, + headers: { "Content-Type": "text/xml" }, + body: '', + })); + const twilioProvider: VoiceCallProvider = { + ...provider, + name: "twilio", + verifyWebhook: () => ({ ok: true, verifiedRequestKey: "twilio:req:rt-outbound" }), + parseWebhookEvent, + }; + const { manager, processEvent } = createManager([]); + const config = createConfig({ + provider: "twilio", + inboundPolicy: "disabled", + realtime: { + enabled: true, + streamPath: "/voice/stream/realtime", + tools: [], + providers: {}, + }, + }); + const server = new VoiceCallWebhookServer(config, manager, twilioProvider); + server.setRealtimeHandler({ + buildTwiMLPayload, + getStreamPathPattern: () => "/voice/stream/realtime", + handleWebSocketUpgrade: () => {}, + registerToolHandler: () => {}, + setPublicUrl: () => {}, + } as unknown as RealtimeCallHandler); + + try { + const baseUrl = await server.start(); + const response = await postWebhookFormWithHeaders( + server, + baseUrl, + `CallSid=CA123&Direction=${direction}&CallStatus=in-progress&From=%2B15550001111&To=%2B15550002222`, + { "x-twilio-signature": "sig" }, + ); + + expect(response.status).toBe(200); + expect(await response.text()).toContain(" { 
const buildTwiMLPayload = vi.fn(() => ({ statusCode: 200, diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index f611e54226c..8b35dc93589 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -643,7 +643,9 @@ export class VoiceCallWebhookServer { const realtimeParams = this.getRealtimeTwimlParams(ctx); if (realtimeParams) { - if (!this.shouldAcceptRealtimeInboundRequest(realtimeParams)) { + const direction = realtimeParams.get("Direction"); + const isInboundRealtimeRequest = !direction || direction === "inbound"; + if (isInboundRealtimeRequest && !this.shouldAcceptRealtimeInboundRequest(realtimeParams)) { console.log("[voice-call] Realtime inbound call rejected before stream setup"); return buildRealtimeRejectedTwiML(); } @@ -718,8 +720,9 @@ export class VoiceCallWebhookServer { const params = new URLSearchParams(ctx.rawBody); const direction = params.get("Direction"); - const isInbound = !direction || direction === "inbound"; - if (!isInbound) { + const isSupportedDirection = + !direction || direction === "inbound" || direction.startsWith("outbound"); + if (!isSupportedDirection) { return null; }