fix(voice-call): keep outbound realtime streams attached (#71266)

Fixes outbound Twilio realtime conversations: the TwiML fetch now returns the realtime `<Connect><Stream>` response for outbound directions, and the answered-call path no longer overwrites it with a legacy `<Say>` TwiML update.

Local proof:
- pnpm test extensions/voice-call/src/manager.notify.test.ts extensions/voice-call/src/webhook.test.ts
- pnpm check:changed
- pnpm check
- pnpm build
- local VoiceCallWebhookServer + CallManager smoke test for Direction=outbound-api

Closes #68713.
This commit is contained in:
Peter Steinberger
2026-04-24 22:35:26 +01:00
committed by GitHub
parent 5b8bd6371c
commit 8a9d02dd82
5 changed files with 97 additions and 3 deletions

View File

@@ -548,6 +548,7 @@ For outbound `conversation` calls, first-message handling is tied to live playba
- Barge-in queue clear and auto-response are suppressed only while the initial greeting is actively speaking.
- If initial playback fails, the call returns to `listening` and the initial message remains queued for retry.
- Initial playback for Twilio streaming starts on stream connect without extra delay.
- Realtime voice conversations use the realtime stream's own opening turn. Voice Call does not post a legacy `<Say>` TwiML update for that initial message, so outbound `<Connect><Stream>` sessions stay attached.
### Twilio stream disconnect grace

View File

@@ -177,6 +177,38 @@ describe("CallManager notify and mapping", () => {
expectFirstPlayTtsText(provider, "Twilio non-stream");
});
// Regression test: with realtime enabled, answering a "conversation" call must not
// make the manager play the initial message through the provider; the message is
// expected to remain in the call metadata instead.
it("lets realtime conversations own the initial greeting instead of posting legacy TwiML", async () => {
// Harness configured with realtime enabled and a FakeProvider constructed with "twilio".
const { manager, provider } = await createManagerHarness(
{ realtime: { enabled: true, provider: "openai" } },
new FakeProvider("twilio"),
);
// Start a call carrying an initial message, in "conversation" mode.
const callId = await initiateCallWithMessage(
manager,
"+15550000010",
"Tell Nana dinner is at 6pm.",
"conversation",
);
// NOTE(review): answerCall is defined outside this hunk; presumably it feeds an
// "answered" event (identified by the third argument) into the manager — confirm.
await answerCall(manager, callId, "evt-conversation-twilio-realtime");
// No TTS playback may have been requested from the provider on answer.
expect(provider.playTtsCalls).toHaveLength(0);
// The initial message must still be present on the call's metadata.
expect(requireCall(manager, callId).metadata).toEqual(
expect.objectContaining({ initialMessage: "Tell Nana dinner is at 6pm." }),
);
});
// Counterpart to the realtime conversation case: with the same realtime-enabled
// configuration, a call started in "notify" mode is still expected to have its
// initial message spoken once the call is answered.
it("still speaks initial message in notify mode when realtime is enabled", async () => {
// Same harness shape as the conversation test: realtime enabled, FakeProvider("twilio").
const { manager, provider } = await createManagerHarness(
{ realtime: { enabled: true, provider: "openai" } },
new FakeProvider("twilio"),
);
const callId = await initiateCallWithMessage(manager, "+15550000011", "Notify text", "notify");
await answerCall(manager, callId, "evt-notify-twilio-realtime");
// NOTE(review): expectFirstPlayTtsText is defined outside this hunk; presumably it
// asserts that the provider's first TTS request carried the given text — confirm.
expectFirstPlayTtsText(provider, "Notify text");
});
it("waits for stream connect in conversation mode when Twilio streaming is enabled", async () => {
const { manager, provider } = await createManagerHarness(
{ streaming: { enabled: true } },

View File

@@ -307,6 +307,9 @@ export class CallManager {
// is actually available; otherwise speak immediately on answered.
const mode = (call.metadata?.mode as string | undefined) ?? "conversation";
if (mode === "conversation") {
if (this.config.realtime.enabled) {
return;
}
const shouldWaitForStreamConnect =
this.shouldDeferConversationInitialMessageUntilStreamConnect();
if (shouldWaitForStreamConnect) {

View File

@@ -606,6 +606,61 @@ describe("VoiceCallWebhookServer replay handling", () => {
}
});
// Parameterised over the two outbound Direction values ("outbound-api" and
// "outbound-dial"): a webhook POST carrying that Direction must be answered with the
// realtime handler's TwiML, without the provider parsing the webhook or the manager
// processing any event.
it.each(["outbound-api", "outbound-dial"] as const)(
"returns realtime TwiML for %s twilio TwiML fetches",
async (direction) => {
// Spy that would report zero events; the test later asserts it is never invoked.
const parseWebhookEvent = vi.fn(() => ({ events: [], statusCode: 200 }));
// Stubbed realtime payload builder returning a fixed <Connect><Stream> XML response.
const buildTwiMLPayload = vi.fn(() => ({
statusCode: 200,
headers: { "Content-Type": "text/xml" },
body: '<Response><Connect><Stream url="wss://example.test/voice/stream/realtime/token" /></Connect></Response>',
}));
// Provider built from the shared `provider` fixture (defined outside this hunk),
// renamed to "twilio", with verification always succeeding and the parse spy attached.
const twilioProvider: VoiceCallProvider = {
...provider,
name: "twilio",
verifyWebhook: () => ({ ok: true, verifiedRequestKey: "twilio:req:rt-outbound" }),
parseWebhookEvent,
};
const { manager, processEvent } = createManager([]);
// Realtime is enabled while inboundPolicy is "disabled".
// NOTE(review): presumably this shows outbound TwiML fetches are not subject to the
// inbound policy — confirm against the server's acceptance logic.
const config = createConfig({
provider: "twilio",
inboundPolicy: "disabled",
realtime: {
enabled: true,
streamPath: "/voice/stream/realtime",
tools: [],
providers: {},
},
});
const server = new VoiceCallWebhookServer(config, manager, twilioProvider);
// Stub handler with the payload spy and no-op methods; it is cast through `unknown`
// to RealtimeCallHandler rather than being declared as that type.
server.setRealtimeHandler({
buildTwiMLPayload,
getStreamPathPattern: () => "/voice/stream/realtime",
handleWebSocketUpgrade: () => {},
registerToolHandler: () => {},
setPublicUrl: () => {},
} as unknown as RealtimeCallHandler);
try {
const baseUrl = await server.start();
// Form-encoded webhook body with the Direction under test and an in-progress call status.
const response = await postWebhookFormWithHeaders(
server,
baseUrl,
`CallSid=CA123&Direction=${direction}&CallStatus=in-progress&From=%2B15550001111&To=%2B15550002222`,
{ "x-twilio-signature": "sig" },
);
// The response must be the realtime TwiML, built exactly once...
expect(response.status).toBe(200);
expect(await response.text()).toContain("<Connect><Stream");
expect(buildTwiMLPayload).toHaveBeenCalledTimes(1);
// ...and neither webhook parsing nor manager event processing may have run.
expect(parseWebhookEvent).not.toHaveBeenCalled();
expect(processEvent).not.toHaveBeenCalled();
} finally {
// Always stop the server, even when an assertion above fails.
await server.stop();
}
},
);
it("rejects non-allowlisted inbound realtime calls before creating a stream token", async () => {
const buildTwiMLPayload = vi.fn(() => ({
statusCode: 200,

View File

@@ -643,7 +643,9 @@ export class VoiceCallWebhookServer {
const realtimeParams = this.getRealtimeTwimlParams(ctx);
if (realtimeParams) {
if (!this.shouldAcceptRealtimeInboundRequest(realtimeParams)) {
const direction = realtimeParams.get("Direction");
const isInboundRealtimeRequest = !direction || direction === "inbound";
if (isInboundRealtimeRequest && !this.shouldAcceptRealtimeInboundRequest(realtimeParams)) {
console.log("[voice-call] Realtime inbound call rejected before stream setup");
return buildRealtimeRejectedTwiML();
}
@@ -718,8 +720,9 @@ export class VoiceCallWebhookServer {
const params = new URLSearchParams(ctx.rawBody);
const direction = params.get("Direction");
const isInbound = !direction || direction === "inbound";
if (!isInbound) {
const isSupportedDirection =
!direction || direction === "inbound" || direction.startsWith("outbound");
if (!isSupportedDirection) {
return null;
}