mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
fix: delay meet twilio intro speech
This commit is contained in:
@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Google Meet/Voice Call: defer Twilio dial-in intro speech until after Meet DTMF entry and route delayed speech through the active realtime Voice Call bridge. Thanks @donkeykong91 and @PfanP.
|
||||
- Google Meet/Voice Call: make Twilio setup preflight honor explicit `--transport twilio` and fail local/private Voice Call webhook URLs before joins. Thanks @donkeykong91 and @PfanP.
|
||||
- Voice Call/Twilio: retry transient 21220 live-call TwiML updates and catch answered-path initial-greeting failures, so a fast answered callback no longer crashes the Gateway or drops the Twilio greeting/listen transition. (#74606) Thanks @Sivan22.
|
||||
- Voice Call/Twilio: register accepted media streams immediately but wait for realtime transcription readiness before speaking the initial greeting, so reconnect grace handling stays live while OpenAI STT startup is no longer starved by TTS. Fixes #75197. (#75257) Thanks @donkeykong91 and @PfanP.
|
||||
|
||||
@@ -1411,6 +1411,9 @@ participant:
|
||||
the PIN.
|
||||
- Increase the leading pauses in `--dtmf-sequence` if Meet answers slowly, for
|
||||
example `wwww123456#`.
|
||||
- If the participant joins but you miss the first spoken line, increase
|
||||
`plugins.entries.google-meet.config.voiceCall.postDtmfSpeechDelayMs` so the
|
||||
intro is spoken after Meet finishes admitting the phone participant.
|
||||
|
||||
If webhooks do not arrive, debug the Voice Call plugin first: the provider must
|
||||
reach `plugins.entries.voice-call.config.publicUrl` or the configured tunnel.
|
||||
|
||||
@@ -766,6 +766,11 @@ If Voice Call is green but the Meet participant never joins, check the Meet
|
||||
dial-in number, PIN, and `--dtmf-sequence`. The phone call can be healthy while
|
||||
the meeting rejects or ignores an incorrect DTMF sequence.
|
||||
|
||||
Google Meet starts Voice Call silently, sends DTMF, then asks Voice Call to
|
||||
speak the intro after `voiceCall.postDtmfSpeechDelayMs`. Increase that delay in
|
||||
the Google Meet plugin config if the first line is spoken before Meet admits the
|
||||
phone participant.
|
||||
|
||||
### Realtime call has no speech
|
||||
|
||||
Confirm only one audio mode is enabled. `realtime.enabled` and
|
||||
|
||||
@@ -12,8 +12,13 @@ import {
|
||||
import { CREATE_MEET_FROM_BROWSER_SCRIPT } from "./src/transports/chrome-create.js";
|
||||
|
||||
const voiceCallMocks = vi.hoisted(() => ({
|
||||
joinMeetViaVoiceCallGateway: vi.fn(async () => ({ callId: "call-1", dtmfSent: true })),
|
||||
joinMeetViaVoiceCallGateway: vi.fn(async () => ({
|
||||
callId: "call-1",
|
||||
dtmfSent: true,
|
||||
introSent: true,
|
||||
})),
|
||||
endMeetVoiceCallGatewayCall: vi.fn(async () => {}),
|
||||
speakMeetViaVoiceCallGateway: vi.fn(async () => {}),
|
||||
}));
|
||||
|
||||
const fetchGuardMocks = vi.hoisted(() => ({
|
||||
@@ -38,6 +43,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
|
||||
vi.mock("./src/voice-call-gateway.js", () => ({
|
||||
joinMeetViaVoiceCallGateway: voiceCallMocks.joinMeetViaVoiceCallGateway,
|
||||
endMeetVoiceCallGatewayCall: voiceCallMocks.endMeetVoiceCallGatewayCall,
|
||||
speakMeetViaVoiceCallGateway: voiceCallMocks.speakMeetViaVoiceCallGateway,
|
||||
}));
|
||||
|
||||
function setup(
|
||||
|
||||
@@ -35,8 +35,13 @@ import { buildMeetDtmfSequence, normalizeDialInNumber } from "./src/transports/t
|
||||
import type { GoogleMeetSession } from "./src/transports/types.js";
|
||||
|
||||
const voiceCallMocks = vi.hoisted(() => ({
|
||||
joinMeetViaVoiceCallGateway: vi.fn(async () => ({ callId: "call-1", dtmfSent: true })),
|
||||
joinMeetViaVoiceCallGateway: vi.fn(async () => ({
|
||||
callId: "call-1",
|
||||
dtmfSent: true,
|
||||
introSent: true,
|
||||
})),
|
||||
endMeetVoiceCallGatewayCall: vi.fn(async () => {}),
|
||||
speakMeetViaVoiceCallGateway: vi.fn(async () => {}),
|
||||
}));
|
||||
|
||||
const fetchGuardMocks = vi.hoisted(() => ({
|
||||
@@ -61,6 +66,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
|
||||
vi.mock("./src/voice-call-gateway.js", () => ({
|
||||
joinMeetViaVoiceCallGateway: voiceCallMocks.joinMeetViaVoiceCallGateway,
|
||||
endMeetVoiceCallGatewayCall: voiceCallMocks.endMeetVoiceCallGatewayCall,
|
||||
speakMeetViaVoiceCallGateway: voiceCallMocks.speakMeetViaVoiceCallGateway,
|
||||
}));
|
||||
|
||||
function setup(
|
||||
@@ -348,7 +354,12 @@ describe("google-meet plugin", () => {
|
||||
"BlackHole 2ch",
|
||||
],
|
||||
},
|
||||
voiceCall: { enabled: true, requestTimeoutMs: 30000, dtmfDelayMs: 2500 },
|
||||
voiceCall: {
|
||||
enabled: true,
|
||||
requestTimeoutMs: 30000,
|
||||
dtmfDelayMs: 2500,
|
||||
postDtmfSpeechDelayMs: 5000,
|
||||
},
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
introMessage: "Say exactly: I'm here and listening.",
|
||||
@@ -955,12 +966,14 @@ describe("google-meet plugin", () => {
|
||||
dtmfSequence: "123456#",
|
||||
voiceCallId: "call-1",
|
||||
dtmfSent: true,
|
||||
introSent: true,
|
||||
},
|
||||
});
|
||||
expect(voiceCallMocks.joinMeetViaVoiceCallGateway).toHaveBeenCalledWith({
|
||||
config: expect.objectContaining({ defaultTransport: "twilio" }),
|
||||
dialInNumber: "+15551234567",
|
||||
dtmfSequence: "123456#",
|
||||
message: "Say exactly: I'm here and listening.",
|
||||
});
|
||||
});
|
||||
|
||||
@@ -984,6 +997,32 @@ describe("google-meet plugin", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("delegates Twilio session speech through voice-call", async () => {
|
||||
const { tools } = setup({ defaultTransport: "twilio" });
|
||||
const tool = tools[0] as {
|
||||
execute: (id: string, params: unknown) => Promise<{ details: { session: { id: string } } }>;
|
||||
};
|
||||
const joined = await tool.execute("id", {
|
||||
action: "join",
|
||||
url: "https://meet.google.com/abc-defg-hij",
|
||||
dialInNumber: "+15551234567",
|
||||
pin: "123456",
|
||||
});
|
||||
|
||||
const spoken = await tool.execute("id", {
|
||||
action: "speak",
|
||||
sessionId: joined.details.session.id,
|
||||
message: "Say exactly: hello after joining.",
|
||||
});
|
||||
|
||||
expect(spoken.details).toMatchObject({ spoken: true });
|
||||
expect(voiceCallMocks.speakMeetViaVoiceCallGateway).toHaveBeenCalledWith({
|
||||
config: expect.objectContaining({ defaultTransport: "twilio" }),
|
||||
callId: "call-1",
|
||||
message: "Say exactly: hello after joining.",
|
||||
});
|
||||
});
|
||||
|
||||
it("reports setup status through the tool", async () => {
|
||||
const originalPlatform = process.platform;
|
||||
Object.defineProperty(process, "platform", { value: "darwin" });
|
||||
|
||||
@@ -119,6 +119,10 @@ const googleMeetConfigSchema = {
|
||||
advanced: true,
|
||||
},
|
||||
"voiceCall.dtmfDelayMs": { label: "DTMF Delay (ms)", advanced: true },
|
||||
"voiceCall.postDtmfSpeechDelayMs": {
|
||||
label: "Post-DTMF Speech Delay (ms)",
|
||||
advanced: true,
|
||||
},
|
||||
"voiceCall.introMessage": { label: "Voice Call Intro Message", advanced: true },
|
||||
"realtime.provider": {
|
||||
label: "Realtime Provider",
|
||||
|
||||
@@ -52,6 +52,7 @@ export type GoogleMeetConfig = {
|
||||
token?: string;
|
||||
requestTimeoutMs: number;
|
||||
dtmfDelayMs: number;
|
||||
postDtmfSpeechDelayMs: number;
|
||||
introMessage?: string;
|
||||
};
|
||||
realtime: {
|
||||
@@ -181,6 +182,7 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
|
||||
enabled: true,
|
||||
requestTimeoutMs: 30_000,
|
||||
dtmfDelayMs: 2_500,
|
||||
postDtmfSpeechDelayMs: 5_000,
|
||||
},
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
@@ -432,6 +434,10 @@ export function resolveGoogleMeetConfigWithEnv(
|
||||
voiceCall.dtmfDelayMs,
|
||||
DEFAULT_GOOGLE_MEET_CONFIG.voiceCall.dtmfDelayMs,
|
||||
),
|
||||
postDtmfSpeechDelayMs: resolveNumber(
|
||||
voiceCall.postDtmfSpeechDelayMs,
|
||||
DEFAULT_GOOGLE_MEET_CONFIG.voiceCall.postDtmfSpeechDelayMs,
|
||||
),
|
||||
introMessage: normalizeOptionalString(voiceCall.introMessage),
|
||||
},
|
||||
realtime: {
|
||||
|
||||
@@ -21,7 +21,11 @@ import type {
|
||||
GoogleMeetJoinResult,
|
||||
GoogleMeetSession,
|
||||
} from "./transports/types.js";
|
||||
import { endMeetVoiceCallGatewayCall, joinMeetViaVoiceCallGateway } from "./voice-call-gateway.js";
|
||||
import {
|
||||
endMeetVoiceCallGatewayCall,
|
||||
joinMeetViaVoiceCallGateway,
|
||||
speakMeetViaVoiceCallGateway,
|
||||
} from "./voice-call-gateway.js";
|
||||
|
||||
function nowIso(): string {
|
||||
return new Date().toISOString();
|
||||
@@ -301,6 +305,7 @@ export class GoogleMeetRuntime {
|
||||
return { session: reusable, spoken };
|
||||
}
|
||||
const createdAt = nowIso();
|
||||
let delegatedTwilioSpoken = false;
|
||||
|
||||
const session: GoogleMeetSession = {
|
||||
id: `meet_${randomUUID()}`,
|
||||
@@ -398,14 +403,22 @@ export class GoogleMeetRuntime {
|
||||
config: this.params.config,
|
||||
dialInNumber,
|
||||
dtmfSequence,
|
||||
message:
|
||||
mode === "realtime"
|
||||
? (request.message ??
|
||||
this.params.config.voiceCall.introMessage ??
|
||||
this.params.config.realtime.introMessage)
|
||||
: undefined,
|
||||
})
|
||||
: undefined;
|
||||
delegatedTwilioSpoken = Boolean(voiceCallResult?.introSent);
|
||||
session.twilio = {
|
||||
dialInNumber,
|
||||
pinProvided: Boolean(request.pin ?? this.params.config.twilio.defaultPin),
|
||||
dtmfSequence,
|
||||
voiceCallId: voiceCallResult?.callId,
|
||||
dtmfSent: voiceCallResult?.dtmfSent,
|
||||
introSent: voiceCallResult?.introSent,
|
||||
};
|
||||
if (voiceCallResult?.callId) {
|
||||
this.#sessionStops.set(session.id, async () => {
|
||||
@@ -428,9 +441,11 @@ export class GoogleMeetRuntime {
|
||||
|
||||
this.#sessions.set(session.id, session);
|
||||
const spoken =
|
||||
mode === "realtime" && speechInstructions
|
||||
? (await this.speak(session.id, speechInstructions)).spoken
|
||||
: false;
|
||||
transport === "twilio"
|
||||
? delegatedTwilioSpoken
|
||||
: mode === "realtime" && speechInstructions
|
||||
? (await this.speak(session.id, speechInstructions)).spoken
|
||||
: false;
|
||||
return { session, spoken };
|
||||
}
|
||||
|
||||
@@ -459,6 +474,20 @@ export class GoogleMeetRuntime {
|
||||
if (!session) {
|
||||
return { found: false, spoken: false };
|
||||
}
|
||||
if (session.transport === "twilio" && session.twilio?.voiceCallId) {
|
||||
await speakMeetViaVoiceCallGateway({
|
||||
config: this.params.config,
|
||||
callId: session.twilio.voiceCallId,
|
||||
message:
|
||||
instructions ||
|
||||
this.params.config.voiceCall.introMessage ||
|
||||
this.params.config.realtime.introMessage ||
|
||||
"",
|
||||
});
|
||||
session.twilio.introSent = true;
|
||||
session.updatedAt = nowIso();
|
||||
return { found: true, spoken: true, session };
|
||||
}
|
||||
await this.#refreshBrowserHealthForChromeSession(session);
|
||||
const speak = this.#sessionSpeakers.get(sessionId);
|
||||
if (!speak || session.state !== "active") {
|
||||
|
||||
@@ -86,6 +86,7 @@ export type GoogleMeetSession = {
|
||||
dtmfSequence?: string;
|
||||
voiceCallId?: string;
|
||||
dtmfSent?: boolean;
|
||||
introSent?: boolean;
|
||||
};
|
||||
notes: string[];
|
||||
};
|
||||
|
||||
@@ -27,25 +27,49 @@ describe("Google Meet voice-call gateway", () => {
|
||||
gatewayMocks.startGatewayClientWhenEventLoopReady.mockClear();
|
||||
});
|
||||
|
||||
it("starts Twilio Meet calls in conversation mode with the realtime intro by default", async () => {
|
||||
it("starts Twilio Meet calls silently, sends DTMF, then speaks the realtime intro", async () => {
|
||||
const config = resolveGoogleMeetConfig({
|
||||
voiceCall: { gatewayUrl: "ws://127.0.0.1:18789" },
|
||||
voiceCall: {
|
||||
gatewayUrl: "ws://127.0.0.1:18789",
|
||||
dtmfDelayMs: 1,
|
||||
postDtmfSpeechDelayMs: 1,
|
||||
},
|
||||
realtime: { introMessage: "Say exactly: I'm here and listening." },
|
||||
});
|
||||
|
||||
await joinMeetViaVoiceCallGateway({
|
||||
config,
|
||||
dialInNumber: "+15551234567",
|
||||
dtmfSequence: "123456#",
|
||||
message: "Say exactly: I'm here and listening.",
|
||||
});
|
||||
|
||||
expect(gatewayMocks.request).toHaveBeenCalledWith(
|
||||
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
"voicecall.start",
|
||||
{
|
||||
to: "+15551234567",
|
||||
message: "Say exactly: I'm here and listening.",
|
||||
mode: "conversation",
|
||||
},
|
||||
{ timeoutMs: 30_000 },
|
||||
);
|
||||
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
"voicecall.dtmf",
|
||||
{
|
||||
callId: "call-1",
|
||||
digits: "123456#",
|
||||
},
|
||||
{ timeoutMs: 30_000 },
|
||||
);
|
||||
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
|
||||
3,
|
||||
"voicecall.speak",
|
||||
{
|
||||
callId: "call-1",
|
||||
message: "Say exactly: I'm here and listening.",
|
||||
},
|
||||
{ timeoutMs: 30_000 },
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -13,9 +13,15 @@ type VoiceCallStartResult = {
|
||||
error?: string;
|
||||
};
|
||||
|
||||
type VoiceCallSpeakResult = {
|
||||
success?: boolean;
|
||||
error?: string;
|
||||
};
|
||||
|
||||
export type VoiceCallMeetJoinResult = {
|
||||
callId: string;
|
||||
dtmfSent: boolean;
|
||||
introSent: boolean;
|
||||
};
|
||||
|
||||
async function createConnectedGatewayClient(
|
||||
@@ -67,6 +73,7 @@ export async function joinMeetViaVoiceCallGateway(params: {
|
||||
config: GoogleMeetConfig;
|
||||
dialInNumber: string;
|
||||
dtmfSequence?: string;
|
||||
message?: string;
|
||||
}): Promise<VoiceCallMeetJoinResult> {
|
||||
let client: VoiceCallGatewayClient | undefined;
|
||||
|
||||
@@ -76,7 +83,6 @@ export async function joinMeetViaVoiceCallGateway(params: {
|
||||
"voicecall.start",
|
||||
{
|
||||
to: params.dialInNumber,
|
||||
message: params.config.voiceCall.introMessage ?? params.config.realtime.introMessage,
|
||||
mode: "conversation",
|
||||
},
|
||||
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
|
||||
@@ -95,7 +101,25 @@ export async function joinMeetViaVoiceCallGateway(params: {
|
||||
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
|
||||
);
|
||||
}
|
||||
return { callId: start.callId, dtmfSent: Boolean(params.dtmfSequence) };
|
||||
if (params.message) {
|
||||
await sleep(params.config.voiceCall.postDtmfSpeechDelayMs);
|
||||
const spoken = (await client.request(
|
||||
"voicecall.speak",
|
||||
{
|
||||
callId: start.callId,
|
||||
message: params.message,
|
||||
},
|
||||
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
|
||||
)) as VoiceCallSpeakResult;
|
||||
if (spoken.success === false) {
|
||||
throw new Error(spoken.error || "voicecall.speak failed");
|
||||
}
|
||||
}
|
||||
return {
|
||||
callId: start.callId,
|
||||
dtmfSent: Boolean(params.dtmfSequence),
|
||||
introSent: Boolean(params.message),
|
||||
};
|
||||
} finally {
|
||||
await client?.stopAndWait({ timeoutMs: 1_000 });
|
||||
}
|
||||
@@ -120,3 +144,28 @@ export async function endMeetVoiceCallGatewayCall(params: {
|
||||
await client?.stopAndWait({ timeoutMs: 1_000 });
|
||||
}
|
||||
}
|
||||
|
||||
export async function speakMeetViaVoiceCallGateway(params: {
|
||||
config: GoogleMeetConfig;
|
||||
callId: string;
|
||||
message: string;
|
||||
}): Promise<void> {
|
||||
let client: VoiceCallGatewayClient | undefined;
|
||||
|
||||
try {
|
||||
client = await createConnectedGatewayClient(params.config);
|
||||
const spoken = (await client.request(
|
||||
"voicecall.speak",
|
||||
{
|
||||
callId: params.callId,
|
||||
message: params.message,
|
||||
},
|
||||
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
|
||||
)) as VoiceCallSpeakResult;
|
||||
if (spoken.success === false) {
|
||||
throw new Error(spoken.error || "voicecall.speak failed");
|
||||
}
|
||||
} finally {
|
||||
await client?.stopAndWait({ timeoutMs: 1_000 });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -369,12 +369,27 @@ export default definePluginEntry({
|
||||
"voicecall.speak",
|
||||
async ({ params, respond }: GatewayRequestHandlerOptions) => {
|
||||
try {
|
||||
await respondToCallMessageAction({
|
||||
requestParams: params,
|
||||
respond,
|
||||
action: (request) => request.rt.manager.speak(request.callId, request.message),
|
||||
failure: "speak failed",
|
||||
});
|
||||
const request = await resolveCallMessageRequest(params);
|
||||
if ("error" in request) {
|
||||
respond(false, { error: request.error });
|
||||
return;
|
||||
}
|
||||
if (request.rt.config.realtime.enabled) {
|
||||
const realtimeResult = request.rt.webhookServer.speakRealtime(
|
||||
request.callId,
|
||||
request.message,
|
||||
);
|
||||
if (realtimeResult.success) {
|
||||
respond(true, { success: true });
|
||||
return;
|
||||
}
|
||||
}
|
||||
const result = await request.rt.manager.speak(request.callId, request.message);
|
||||
if (!result.success) {
|
||||
respond(false, { error: result.error || "speak failed" });
|
||||
return;
|
||||
}
|
||||
respond(true, { success: true });
|
||||
} catch (err) {
|
||||
sendError(respond, err);
|
||||
}
|
||||
|
||||
@@ -195,6 +195,13 @@ export class VoiceCallWebhookServer {
|
||||
return this.realtimeHandler;
|
||||
}
|
||||
|
||||
/**
 * Routes a speak request to the realtime voice handler, if one is registered.
 *
 * @returns `{ success: true }`-style result from the handler when present,
 *   otherwise a failure result explaining that no handler is configured.
 */
speakRealtime(callId: string, instructions: string): { success: boolean; error?: string } {
  const handler = this.realtimeHandler;
  if (handler) {
    return handler.speak(callId, instructions);
  }
  return { success: false, error: "Realtime voice handler is not configured" };
}
|
||||
|
||||
setRealtimeHandler(handler: RealtimeCallHandler): void {
|
||||
this.realtimeHandler = handler;
|
||||
}
|
||||
|
||||
@@ -214,6 +214,121 @@ describe("RealtimeCallHandler path routing", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("does not emit an outbound realtime greeting without an initial message", async () => {
|
||||
let callbacks:
|
||||
| {
|
||||
onReady?: () => void;
|
||||
}
|
||||
| undefined;
|
||||
const triggerGreeting = vi.fn();
|
||||
const createBridge = vi.fn(
|
||||
(request: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0]) => {
|
||||
callbacks = request;
|
||||
return makeBridge({ triggerGreeting });
|
||||
},
|
||||
);
|
||||
const getCallByProviderCallId = vi.fn(
|
||||
(): CallRecord => ({
|
||||
callId: "call-1",
|
||||
providerCallId: "CA-silent",
|
||||
provider: "twilio",
|
||||
direction: "outbound",
|
||||
state: "ringing",
|
||||
from: "+15550001234",
|
||||
to: "+15550009999",
|
||||
startedAt: Date.now(),
|
||||
transcript: [],
|
||||
processedEventIds: [],
|
||||
metadata: {},
|
||||
}),
|
||||
);
|
||||
const handler = makeHandler(undefined, {
|
||||
manager: {
|
||||
getCallByProviderCallId,
|
||||
},
|
||||
realtimeProvider: makeRealtimeProvider(createBridge),
|
||||
});
|
||||
const server = await startRealtimeServer(handler);
|
||||
|
||||
try {
|
||||
const ws = await connectWs(server.url);
|
||||
try {
|
||||
ws.send(
|
||||
JSON.stringify({
|
||||
event: "start",
|
||||
start: { streamSid: "MZ-silent", callSid: "CA-silent" },
|
||||
}),
|
||||
);
|
||||
await vi.waitFor(() => {
|
||||
expect(createBridge).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
callbacks?.onReady?.();
|
||||
|
||||
expect(triggerGreeting).not.toHaveBeenCalled();
|
||||
} finally {
|
||||
if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) {
|
||||
ws.close();
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
await server.close();
|
||||
}
|
||||
});
|
||||
|
||||
it("speaks through the active outbound realtime bridge by call id", async () => {
|
||||
const triggerGreeting = vi.fn();
|
||||
const createBridge = vi.fn(() => makeBridge({ triggerGreeting }));
|
||||
const getCallByProviderCallId = vi.fn(
|
||||
(): CallRecord => ({
|
||||
callId: "call-1",
|
||||
providerCallId: "CA-speak",
|
||||
provider: "twilio",
|
||||
direction: "outbound",
|
||||
state: "ringing",
|
||||
from: "+15550001234",
|
||||
to: "+15550009999",
|
||||
startedAt: Date.now(),
|
||||
transcript: [],
|
||||
processedEventIds: [],
|
||||
metadata: {},
|
||||
}),
|
||||
);
|
||||
const handler = makeHandler(undefined, {
|
||||
manager: {
|
||||
getCallByProviderCallId,
|
||||
},
|
||||
realtimeProvider: makeRealtimeProvider(createBridge),
|
||||
});
|
||||
const server = await startRealtimeServer(handler);
|
||||
|
||||
try {
|
||||
const ws = await connectWs(server.url);
|
||||
try {
|
||||
ws.send(
|
||||
JSON.stringify({
|
||||
event: "start",
|
||||
start: { streamSid: "MZ-speak", callSid: "CA-speak" },
|
||||
}),
|
||||
);
|
||||
await vi.waitFor(() => {
|
||||
expect(createBridge).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
expect(handler.speak("call-1", "Say exactly: hello from Meet.")).toEqual({
|
||||
success: true,
|
||||
});
|
||||
expect(triggerGreeting).toHaveBeenCalledWith("Say exactly: hello from Meet.");
|
||||
} finally {
|
||||
if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) {
|
||||
ws.close();
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
await server.close();
|
||||
}
|
||||
});
|
||||
|
||||
it("submits continuing responses only for realtime agent consult calls", async () => {
|
||||
let callbacks:
|
||||
| {
|
||||
|
||||
@@ -41,7 +41,7 @@ function buildGreetingInstructions(
|
||||
): string | undefined {
|
||||
const trimmedGreeting = greeting?.trim();
|
||||
if (!trimmedGreeting) {
|
||||
return baseInstructions;
|
||||
return undefined;
|
||||
}
|
||||
const intro =
|
||||
"Start the call by greeting the caller naturally. Include this greeting in your first spoken reply:";
|
||||
@@ -64,9 +64,15 @@ type CallRegistration = {
|
||||
|
||||
type ActiveRealtimeVoiceBridge = RealtimeVoiceBridgeSession;
|
||||
|
||||
type RealtimeSpeakResult = {
|
||||
success: boolean;
|
||||
error?: string;
|
||||
};
|
||||
|
||||
export class RealtimeCallHandler {
|
||||
private readonly toolHandlers = new Map<string, ToolHandlerFn>();
|
||||
private readonly pendingStreamTokens = new Map<string, PendingStreamToken>();
|
||||
private readonly activeBridgesByCallId = new Map<string, ActiveRealtimeVoiceBridge>();
|
||||
private publicOrigin: string | null = null;
|
||||
private publicPathPrefix = "";
|
||||
|
||||
@@ -199,6 +205,19 @@ export class RealtimeCallHandler {
|
||||
this.toolHandlers.set(name, fn);
|
||||
}
|
||||
|
||||
/**
 * Speaks `instructions` through the active realtime bridge for `callId`.
 *
 * Looks the bridge up by call id (sessions are registered under both the
 * internal call id and the provider call sid elsewhere in this handler).
 *
 * @returns a success result when a bridge exists and accepted the greeting
 *   trigger; a failure result — never a throw — otherwise.
 */
speak(callId: string, instructions: string): RealtimeSpeakResult {
  const bridge = this.activeBridgesByCallId.get(callId);
  if (bridge === undefined) {
    return { success: false, error: "No active realtime bridge for call" };
  }
  try {
    bridge.triggerGreeting(instructions);
  } catch (error) {
    // triggerGreeting may throw synchronously; surface it as a result.
    return { success: false, error: formatErrorMessage(error) };
  }
  return { success: true };
}
|
||||
|
||||
private issueStreamToken(meta: Omit<PendingStreamToken, "expiry"> = {}): string {
|
||||
const token = randomUUID();
|
||||
this.pendingStreamTokens.set(token, { expiry: Date.now() + STREAM_TOKEN_TTL_MS, ...meta });
|
||||
@@ -254,7 +273,7 @@ export class RealtimeCallHandler {
|
||||
instructions: this.config.instructions,
|
||||
tools: this.config.tools,
|
||||
initialGreetingInstructions,
|
||||
triggerGreetingOnReady: true,
|
||||
triggerGreetingOnReady: Boolean(initialGreetingInstructions),
|
||||
audioSink: {
|
||||
isOpen: () => ws.readyState === WebSocket.OPEN,
|
||||
sendAudio: (muLaw) => {
|
||||
@@ -312,6 +331,8 @@ export class RealtimeCallHandler {
|
||||
console.error("[voice-call] realtime voice error:", error.message);
|
||||
},
|
||||
onClose: (reason) => {
|
||||
this.activeBridgesByCallId.delete(callId);
|
||||
this.activeBridgesByCallId.delete(callSid);
|
||||
if (reason !== "error") {
|
||||
return;
|
||||
}
|
||||
@@ -330,6 +351,14 @@ export class RealtimeCallHandler {
|
||||
});
|
||||
},
|
||||
});
|
||||
this.activeBridgesByCallId.set(callId, bridge);
|
||||
this.activeBridgesByCallId.set(callSid, bridge);
|
||||
const closeBridge = bridge.close.bind(bridge);
|
||||
bridge.close = () => {
|
||||
this.activeBridgesByCallId.delete(callId);
|
||||
this.activeBridgesByCallId.delete(callSid);
|
||||
closeBridge();
|
||||
};
|
||||
|
||||
bridge.connect().catch((error: Error) => {
|
||||
console.error("[voice-call] Failed to connect realtime bridge:", error);
|
||||
|
||||
Reference in New Issue
Block a user