fix: delay meet twilio intro speech

This commit is contained in:
Peter Steinberger
2026-05-01 06:55:16 +01:00
parent e8810c04a4
commit ffcc0d1fe1
15 changed files with 354 additions and 21 deletions

View File

@@ -12,8 +12,13 @@ import {
import { CREATE_MEET_FROM_BROWSER_SCRIPT } from "./src/transports/chrome-create.js";
const voiceCallMocks = vi.hoisted(() => ({
joinMeetViaVoiceCallGateway: vi.fn(async () => ({ callId: "call-1", dtmfSent: true })),
joinMeetViaVoiceCallGateway: vi.fn(async () => ({
callId: "call-1",
dtmfSent: true,
introSent: true,
})),
endMeetVoiceCallGatewayCall: vi.fn(async () => {}),
speakMeetViaVoiceCallGateway: vi.fn(async () => {}),
}));
const fetchGuardMocks = vi.hoisted(() => ({
@@ -38,6 +43,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
vi.mock("./src/voice-call-gateway.js", () => ({
joinMeetViaVoiceCallGateway: voiceCallMocks.joinMeetViaVoiceCallGateway,
endMeetVoiceCallGatewayCall: voiceCallMocks.endMeetVoiceCallGatewayCall,
speakMeetViaVoiceCallGateway: voiceCallMocks.speakMeetViaVoiceCallGateway,
}));
function setup(

View File

@@ -35,8 +35,13 @@ import { buildMeetDtmfSequence, normalizeDialInNumber } from "./src/transports/t
import type { GoogleMeetSession } from "./src/transports/types.js";
const voiceCallMocks = vi.hoisted(() => ({
joinMeetViaVoiceCallGateway: vi.fn(async () => ({ callId: "call-1", dtmfSent: true })),
joinMeetViaVoiceCallGateway: vi.fn(async () => ({
callId: "call-1",
dtmfSent: true,
introSent: true,
})),
endMeetVoiceCallGatewayCall: vi.fn(async () => {}),
speakMeetViaVoiceCallGateway: vi.fn(async () => {}),
}));
const fetchGuardMocks = vi.hoisted(() => ({
@@ -61,6 +66,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
vi.mock("./src/voice-call-gateway.js", () => ({
joinMeetViaVoiceCallGateway: voiceCallMocks.joinMeetViaVoiceCallGateway,
endMeetVoiceCallGatewayCall: voiceCallMocks.endMeetVoiceCallGatewayCall,
speakMeetViaVoiceCallGateway: voiceCallMocks.speakMeetViaVoiceCallGateway,
}));
function setup(
@@ -348,7 +354,12 @@ describe("google-meet plugin", () => {
"BlackHole 2ch",
],
},
voiceCall: { enabled: true, requestTimeoutMs: 30000, dtmfDelayMs: 2500 },
voiceCall: {
enabled: true,
requestTimeoutMs: 30000,
dtmfDelayMs: 2500,
postDtmfSpeechDelayMs: 5000,
},
realtime: {
provider: "openai",
introMessage: "Say exactly: I'm here and listening.",
@@ -955,12 +966,14 @@ describe("google-meet plugin", () => {
dtmfSequence: "123456#",
voiceCallId: "call-1",
dtmfSent: true,
introSent: true,
},
});
expect(voiceCallMocks.joinMeetViaVoiceCallGateway).toHaveBeenCalledWith({
config: expect.objectContaining({ defaultTransport: "twilio" }),
dialInNumber: "+15551234567",
dtmfSequence: "123456#",
message: "Say exactly: I'm here and listening.",
});
});
@@ -984,6 +997,32 @@ describe("google-meet plugin", () => {
});
});
it("delegates Twilio session speech through voice-call", async () => {
const { tools } = setup({ defaultTransport: "twilio" });
const tool = tools[0] as {
execute: (id: string, params: unknown) => Promise<{ details: { session: { id: string } } }>;
};
const joined = await tool.execute("id", {
action: "join",
url: "https://meet.google.com/abc-defg-hij",
dialInNumber: "+15551234567",
pin: "123456",
});
const spoken = await tool.execute("id", {
action: "speak",
sessionId: joined.details.session.id,
message: "Say exactly: hello after joining.",
});
expect(spoken.details).toMatchObject({ spoken: true });
expect(voiceCallMocks.speakMeetViaVoiceCallGateway).toHaveBeenCalledWith({
config: expect.objectContaining({ defaultTransport: "twilio" }),
callId: "call-1",
message: "Say exactly: hello after joining.",
});
});
it("reports setup status through the tool", async () => {
const originalPlatform = process.platform;
Object.defineProperty(process, "platform", { value: "darwin" });

View File

@@ -119,6 +119,10 @@ const googleMeetConfigSchema = {
advanced: true,
},
"voiceCall.dtmfDelayMs": { label: "DTMF Delay (ms)", advanced: true },
"voiceCall.postDtmfSpeechDelayMs": {
label: "Post-DTMF Speech Delay (ms)",
advanced: true,
},
"voiceCall.introMessage": { label: "Voice Call Intro Message", advanced: true },
"realtime.provider": {
label: "Realtime Provider",

View File

@@ -52,6 +52,7 @@ export type GoogleMeetConfig = {
token?: string;
requestTimeoutMs: number;
dtmfDelayMs: number;
postDtmfSpeechDelayMs: number;
introMessage?: string;
};
realtime: {
@@ -181,6 +182,7 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
enabled: true,
requestTimeoutMs: 30_000,
dtmfDelayMs: 2_500,
postDtmfSpeechDelayMs: 5_000,
},
realtime: {
provider: "openai",
@@ -432,6 +434,10 @@ export function resolveGoogleMeetConfigWithEnv(
voiceCall.dtmfDelayMs,
DEFAULT_GOOGLE_MEET_CONFIG.voiceCall.dtmfDelayMs,
),
postDtmfSpeechDelayMs: resolveNumber(
voiceCall.postDtmfSpeechDelayMs,
DEFAULT_GOOGLE_MEET_CONFIG.voiceCall.postDtmfSpeechDelayMs,
),
introMessage: normalizeOptionalString(voiceCall.introMessage),
},
realtime: {

View File

@@ -21,7 +21,11 @@ import type {
GoogleMeetJoinResult,
GoogleMeetSession,
} from "./transports/types.js";
import { endMeetVoiceCallGatewayCall, joinMeetViaVoiceCallGateway } from "./voice-call-gateway.js";
import {
endMeetVoiceCallGatewayCall,
joinMeetViaVoiceCallGateway,
speakMeetViaVoiceCallGateway,
} from "./voice-call-gateway.js";
function nowIso(): string {
return new Date().toISOString();
@@ -301,6 +305,7 @@ export class GoogleMeetRuntime {
return { session: reusable, spoken };
}
const createdAt = nowIso();
let delegatedTwilioSpoken = false;
const session: GoogleMeetSession = {
id: `meet_${randomUUID()}`,
@@ -398,14 +403,22 @@ export class GoogleMeetRuntime {
config: this.params.config,
dialInNumber,
dtmfSequence,
message:
mode === "realtime"
? (request.message ??
this.params.config.voiceCall.introMessage ??
this.params.config.realtime.introMessage)
: undefined,
})
: undefined;
delegatedTwilioSpoken = Boolean(voiceCallResult?.introSent);
session.twilio = {
dialInNumber,
pinProvided: Boolean(request.pin ?? this.params.config.twilio.defaultPin),
dtmfSequence,
voiceCallId: voiceCallResult?.callId,
dtmfSent: voiceCallResult?.dtmfSent,
introSent: voiceCallResult?.introSent,
};
if (voiceCallResult?.callId) {
this.#sessionStops.set(session.id, async () => {
@@ -428,9 +441,11 @@ export class GoogleMeetRuntime {
this.#sessions.set(session.id, session);
const spoken =
mode === "realtime" && speechInstructions
? (await this.speak(session.id, speechInstructions)).spoken
: false;
transport === "twilio"
? delegatedTwilioSpoken
: mode === "realtime" && speechInstructions
? (await this.speak(session.id, speechInstructions)).spoken
: false;
return { session, spoken };
}
@@ -459,6 +474,20 @@ export class GoogleMeetRuntime {
if (!session) {
return { found: false, spoken: false };
}
if (session.transport === "twilio" && session.twilio?.voiceCallId) {
await speakMeetViaVoiceCallGateway({
config: this.params.config,
callId: session.twilio.voiceCallId,
message:
instructions ||
this.params.config.voiceCall.introMessage ||
this.params.config.realtime.introMessage ||
"",
});
session.twilio.introSent = true;
session.updatedAt = nowIso();
return { found: true, spoken: true, session };
}
await this.#refreshBrowserHealthForChromeSession(session);
const speak = this.#sessionSpeakers.get(sessionId);
if (!speak || session.state !== "active") {

View File

@@ -86,6 +86,7 @@ export type GoogleMeetSession = {
dtmfSequence?: string;
voiceCallId?: string;
dtmfSent?: boolean;
introSent?: boolean;
};
notes: string[];
};

View File

@@ -27,25 +27,49 @@ describe("Google Meet voice-call gateway", () => {
gatewayMocks.startGatewayClientWhenEventLoopReady.mockClear();
});
it("starts Twilio Meet calls in conversation mode with the realtime intro by default", async () => {
it("starts Twilio Meet calls silently, sends DTMF, then speaks the realtime intro", async () => {
const config = resolveGoogleMeetConfig({
voiceCall: { gatewayUrl: "ws://127.0.0.1:18789" },
voiceCall: {
gatewayUrl: "ws://127.0.0.1:18789",
dtmfDelayMs: 1,
postDtmfSpeechDelayMs: 1,
},
realtime: { introMessage: "Say exactly: I'm here and listening." },
});
await joinMeetViaVoiceCallGateway({
config,
dialInNumber: "+15551234567",
dtmfSequence: "123456#",
message: "Say exactly: I'm here and listening.",
});
expect(gatewayMocks.request).toHaveBeenCalledWith(
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
1,
"voicecall.start",
{
to: "+15551234567",
message: "Say exactly: I'm here and listening.",
mode: "conversation",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
2,
"voicecall.dtmf",
{
callId: "call-1",
digits: "123456#",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
3,
"voicecall.speak",
{
callId: "call-1",
message: "Say exactly: I'm here and listening.",
},
{ timeoutMs: 30_000 },
);
});
});

View File

@@ -13,9 +13,15 @@ type VoiceCallStartResult = {
error?: string;
};
// Gateway response shape for "voicecall.speak"; only an explicit
// `success: false` signals failure (an absent flag is treated as OK).
type VoiceCallSpeakResult = {
success?: boolean;
error?: string;
};
// Result of joining a Meet call through the voice-call gateway.
export type VoiceCallMeetJoinResult = {
// Gateway-assigned identifier for the placed call.
callId: string;
// True when a DTMF PIN sequence was sent after dialing in.
dtmfSent: boolean;
// True when an intro message was spoken after the DTMF delay.
introSent: boolean;
};
async function createConnectedGatewayClient(
@@ -67,6 +73,7 @@ export async function joinMeetViaVoiceCallGateway(params: {
config: GoogleMeetConfig;
dialInNumber: string;
dtmfSequence?: string;
message?: string;
}): Promise<VoiceCallMeetJoinResult> {
let client: VoiceCallGatewayClient | undefined;
@@ -76,7 +83,6 @@ export async function joinMeetViaVoiceCallGateway(params: {
"voicecall.start",
{
to: params.dialInNumber,
message: params.config.voiceCall.introMessage ?? params.config.realtime.introMessage,
mode: "conversation",
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
@@ -95,7 +101,25 @@ export async function joinMeetViaVoiceCallGateway(params: {
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
);
}
return { callId: start.callId, dtmfSent: Boolean(params.dtmfSequence) };
if (params.message) {
await sleep(params.config.voiceCall.postDtmfSpeechDelayMs);
const spoken = (await client.request(
"voicecall.speak",
{
callId: start.callId,
message: params.message,
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
)) as VoiceCallSpeakResult;
if (spoken.success === false) {
throw new Error(spoken.error || "voicecall.speak failed");
}
}
return {
callId: start.callId,
dtmfSent: Boolean(params.dtmfSequence),
introSent: Boolean(params.message),
};
} finally {
await client?.stopAndWait({ timeoutMs: 1_000 });
}
@@ -120,3 +144,28 @@ export async function endMeetVoiceCallGatewayCall(params: {
await client?.stopAndWait({ timeoutMs: 1_000 });
}
}
/**
 * Speaks a message into an active Meet voice call by issuing the
 * "voicecall.speak" RPC against the voice-call gateway.
 *
 * A fresh gateway connection is opened for this single request and is
 * always torn down afterwards, even when the request throws.
 *
 * @throws Error when the gateway explicitly reports `success: false`.
 */
export async function speakMeetViaVoiceCallGateway(params: {
  config: GoogleMeetConfig;
  callId: string;
  message: string;
}): Promise<void> {
  let gatewayClient: VoiceCallGatewayClient | undefined;
  try {
    gatewayClient = await createConnectedGatewayClient(params.config);
    const payload = { callId: params.callId, message: params.message };
    const response = (await gatewayClient.request("voicecall.speak", payload, {
      timeoutMs: params.config.voiceCall.requestTimeoutMs,
    })) as VoiceCallSpeakResult;
    // Only an explicit `success: false` counts as failure; an omitted
    // flag is considered a successful speak.
    if (response.success === false) {
      throw new Error(response.error || "voicecall.speak failed");
    }
  } finally {
    // Best-effort shutdown of the short-lived gateway connection.
    await gatewayClient?.stopAndWait({ timeoutMs: 1_000 });
  }
}

View File

@@ -369,12 +369,27 @@ export default definePluginEntry({
"voicecall.speak",
async ({ params, respond }: GatewayRequestHandlerOptions) => {
try {
await respondToCallMessageAction({
requestParams: params,
respond,
action: (request) => request.rt.manager.speak(request.callId, request.message),
failure: "speak failed",
});
const request = await resolveCallMessageRequest(params);
if ("error" in request) {
respond(false, { error: request.error });
return;
}
if (request.rt.config.realtime.enabled) {
const realtimeResult = request.rt.webhookServer.speakRealtime(
request.callId,
request.message,
);
if (realtimeResult.success) {
respond(true, { success: true });
return;
}
}
const result = await request.rt.manager.speak(request.callId, request.message);
if (!result.success) {
respond(false, { error: result.error || "speak failed" });
return;
}
respond(true, { success: true });
} catch (err) {
sendError(respond, err);
}

View File

@@ -195,6 +195,13 @@ export class VoiceCallWebhookServer {
return this.realtimeHandler;
}
/**
 * Forwards speech instructions to the registered realtime handler for
 * the given call. Returns a failure result (rather than throwing) when
 * no realtime handler has been configured on this server.
 */
speakRealtime(callId: string, instructions: string): { success: boolean; error?: string } {
  const handler = this.realtimeHandler;
  if (handler) {
    return handler.speak(callId, instructions);
  }
  return { success: false, error: "Realtime voice handler is not configured" };
}
setRealtimeHandler(handler: RealtimeCallHandler): void {
this.realtimeHandler = handler;
}

View File

@@ -214,6 +214,121 @@ describe("RealtimeCallHandler path routing", () => {
}
});
it("does not emit an outbound realtime greeting without an initial message", async () => {
let callbacks:
| {
onReady?: () => void;
}
| undefined;
const triggerGreeting = vi.fn();
const createBridge = vi.fn(
(request: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0]) => {
callbacks = request;
return makeBridge({ triggerGreeting });
},
);
const getCallByProviderCallId = vi.fn(
(): CallRecord => ({
callId: "call-1",
providerCallId: "CA-silent",
provider: "twilio",
direction: "outbound",
state: "ringing",
from: "+15550001234",
to: "+15550009999",
startedAt: Date.now(),
transcript: [],
processedEventIds: [],
metadata: {},
}),
);
const handler = makeHandler(undefined, {
manager: {
getCallByProviderCallId,
},
realtimeProvider: makeRealtimeProvider(createBridge),
});
const server = await startRealtimeServer(handler);
try {
const ws = await connectWs(server.url);
try {
ws.send(
JSON.stringify({
event: "start",
start: { streamSid: "MZ-silent", callSid: "CA-silent" },
}),
);
await vi.waitFor(() => {
expect(createBridge).toHaveBeenCalled();
});
callbacks?.onReady?.();
expect(triggerGreeting).not.toHaveBeenCalled();
} finally {
if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) {
ws.close();
}
}
} finally {
await server.close();
}
});
it("speaks through the active outbound realtime bridge by call id", async () => {
const triggerGreeting = vi.fn();
const createBridge = vi.fn(() => makeBridge({ triggerGreeting }));
const getCallByProviderCallId = vi.fn(
(): CallRecord => ({
callId: "call-1",
providerCallId: "CA-speak",
provider: "twilio",
direction: "outbound",
state: "ringing",
from: "+15550001234",
to: "+15550009999",
startedAt: Date.now(),
transcript: [],
processedEventIds: [],
metadata: {},
}),
);
const handler = makeHandler(undefined, {
manager: {
getCallByProviderCallId,
},
realtimeProvider: makeRealtimeProvider(createBridge),
});
const server = await startRealtimeServer(handler);
try {
const ws = await connectWs(server.url);
try {
ws.send(
JSON.stringify({
event: "start",
start: { streamSid: "MZ-speak", callSid: "CA-speak" },
}),
);
await vi.waitFor(() => {
expect(createBridge).toHaveBeenCalled();
});
expect(handler.speak("call-1", "Say exactly: hello from Meet.")).toEqual({
success: true,
});
expect(triggerGreeting).toHaveBeenCalledWith("Say exactly: hello from Meet.");
} finally {
if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) {
ws.close();
}
}
} finally {
await server.close();
}
});
it("submits continuing responses only for realtime agent consult calls", async () => {
let callbacks:
| {

View File

@@ -41,7 +41,7 @@ function buildGreetingInstructions(
): string | undefined {
const trimmedGreeting = greeting?.trim();
if (!trimmedGreeting) {
return baseInstructions;
return undefined;
}
const intro =
"Start the call by greeting the caller naturally. Include this greeting in your first spoken reply:";
@@ -64,9 +64,15 @@ type CallRegistration = {
type ActiveRealtimeVoiceBridge = RealtimeVoiceBridgeSession;
type RealtimeSpeakResult = {
success: boolean;
error?: string;
};
export class RealtimeCallHandler {
private readonly toolHandlers = new Map<string, ToolHandlerFn>();
private readonly pendingStreamTokens = new Map<string, PendingStreamToken>();
private readonly activeBridgesByCallId = new Map<string, ActiveRealtimeVoiceBridge>();
private publicOrigin: string | null = null;
private publicPathPrefix = "";
@@ -199,6 +205,19 @@ export class RealtimeCallHandler {
this.toolHandlers.set(name, fn);
}
/**
 * Injects `instructions` into the live realtime bridge for `callId`,
 * making the assistant speak mid-call. Errors are reported through the
 * result object instead of being thrown: callers get `success: false`
 * when no bridge is active for the call or the bridge rejects the cue.
 */
speak(callId: string, instructions: string): RealtimeSpeakResult {
  const activeBridge = this.activeBridgesByCallId.get(callId);
  if (activeBridge === undefined) {
    return { success: false, error: "No active realtime bridge for call" };
  }
  try {
    // Reuses the greeting trigger path to cue arbitrary speech.
    activeBridge.triggerGreeting(instructions);
  } catch (error) {
    return { success: false, error: formatErrorMessage(error) };
  }
  return { success: true };
}
private issueStreamToken(meta: Omit<PendingStreamToken, "expiry"> = {}): string {
const token = randomUUID();
this.pendingStreamTokens.set(token, { expiry: Date.now() + STREAM_TOKEN_TTL_MS, ...meta });
@@ -254,7 +273,7 @@ export class RealtimeCallHandler {
instructions: this.config.instructions,
tools: this.config.tools,
initialGreetingInstructions,
triggerGreetingOnReady: true,
triggerGreetingOnReady: Boolean(initialGreetingInstructions),
audioSink: {
isOpen: () => ws.readyState === WebSocket.OPEN,
sendAudio: (muLaw) => {
@@ -312,6 +331,8 @@ export class RealtimeCallHandler {
console.error("[voice-call] realtime voice error:", error.message);
},
onClose: (reason) => {
this.activeBridgesByCallId.delete(callId);
this.activeBridgesByCallId.delete(callSid);
if (reason !== "error") {
return;
}
@@ -330,6 +351,14 @@ export class RealtimeCallHandler {
});
},
});
this.activeBridgesByCallId.set(callId, bridge);
this.activeBridgesByCallId.set(callSid, bridge);
const closeBridge = bridge.close.bind(bridge);
bridge.close = () => {
this.activeBridgesByCallId.delete(callId);
this.activeBridgesByCallId.delete(callSid);
closeBridge();
};
bridge.connect().catch((error: Error) => {
console.error("[voice-call] Failed to connect realtime bridge:", error);