fix: delay meet twilio intro speech

This commit is contained in:
Peter Steinberger
2026-05-01 06:55:16 +01:00
parent e8810c04a4
commit ffcc0d1fe1
15 changed files with 354 additions and 21 deletions

View File

@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Google Meet/Voice Call: defer Twilio dial-in intro speech until after Meet DTMF entry and route delayed speech through the active realtime Voice Call bridge. Thanks @donkeykong91 and @PfanP.
- Google Meet/Voice Call: make Twilio setup preflight honor explicit `--transport twilio` and fail local/private Voice Call webhook URLs before joins. Thanks @donkeykong91 and @PfanP.
- Voice Call/Twilio: retry transient 21220 live-call TwiML updates and catch answered-path initial-greeting failures, so a fast answered callback no longer crashes the Gateway or drops the Twilio greeting/listen transition. (#74606) Thanks @Sivan22.
- Voice Call/Twilio: register accepted media streams immediately but wait for realtime transcription readiness before speaking the initial greeting, so reconnect grace handling stays live while OpenAI STT startup is no longer starved by TTS. Fixes #75197. (#75257) Thanks @donkeykong91 and @PfanP.

View File

@@ -1411,6 +1411,9 @@ participant:
the PIN.
- Increase the leading pauses in `--dtmf-sequence` if Meet answers slowly, for
example `wwww123456#`.
- If the participant joins but you miss the first spoken line, increase
`plugins.entries.google-meet.config.voiceCall.postDtmfSpeechDelayMs` so the
intro is spoken after Meet finishes admitting the phone participant.
If webhooks do not arrive, debug the Voice Call plugin first: the provider must
reach `plugins.entries.voice-call.config.publicUrl` or the configured tunnel.

View File

@@ -766,6 +766,11 @@ If Voice Call is green but the Meet participant never joins, check the Meet
dial-in number, PIN, and `--dtmf-sequence`. The phone call can be healthy while
the meeting rejects or ignores an incorrect DTMF sequence.
Google Meet starts Voice Call silently, sends DTMF, then asks Voice Call to
speak the intro after `voiceCall.postDtmfSpeechDelayMs`. Increase that delay in
the Google Meet plugin config if the first line is spoken before Meet admits the
phone participant.
### Realtime call has no speech
Confirm only one audio mode is enabled. `realtime.enabled` and

View File

@@ -12,8 +12,13 @@ import {
import { CREATE_MEET_FROM_BROWSER_SCRIPT } from "./src/transports/chrome-create.js";
const voiceCallMocks = vi.hoisted(() => ({
joinMeetViaVoiceCallGateway: vi.fn(async () => ({ callId: "call-1", dtmfSent: true })),
joinMeetViaVoiceCallGateway: vi.fn(async () => ({
callId: "call-1",
dtmfSent: true,
introSent: true,
})),
endMeetVoiceCallGatewayCall: vi.fn(async () => {}),
speakMeetViaVoiceCallGateway: vi.fn(async () => {}),
}));
const fetchGuardMocks = vi.hoisted(() => ({
@@ -38,6 +43,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
vi.mock("./src/voice-call-gateway.js", () => ({
joinMeetViaVoiceCallGateway: voiceCallMocks.joinMeetViaVoiceCallGateway,
endMeetVoiceCallGatewayCall: voiceCallMocks.endMeetVoiceCallGatewayCall,
speakMeetViaVoiceCallGateway: voiceCallMocks.speakMeetViaVoiceCallGateway,
}));
function setup(

View File

@@ -35,8 +35,13 @@ import { buildMeetDtmfSequence, normalizeDialInNumber } from "./src/transports/t
import type { GoogleMeetSession } from "./src/transports/types.js";
const voiceCallMocks = vi.hoisted(() => ({
joinMeetViaVoiceCallGateway: vi.fn(async () => ({ callId: "call-1", dtmfSent: true })),
joinMeetViaVoiceCallGateway: vi.fn(async () => ({
callId: "call-1",
dtmfSent: true,
introSent: true,
})),
endMeetVoiceCallGatewayCall: vi.fn(async () => {}),
speakMeetViaVoiceCallGateway: vi.fn(async () => {}),
}));
const fetchGuardMocks = vi.hoisted(() => ({
@@ -61,6 +66,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
vi.mock("./src/voice-call-gateway.js", () => ({
joinMeetViaVoiceCallGateway: voiceCallMocks.joinMeetViaVoiceCallGateway,
endMeetVoiceCallGatewayCall: voiceCallMocks.endMeetVoiceCallGatewayCall,
speakMeetViaVoiceCallGateway: voiceCallMocks.speakMeetViaVoiceCallGateway,
}));
function setup(
@@ -348,7 +354,12 @@ describe("google-meet plugin", () => {
"BlackHole 2ch",
],
},
voiceCall: { enabled: true, requestTimeoutMs: 30000, dtmfDelayMs: 2500 },
voiceCall: {
enabled: true,
requestTimeoutMs: 30000,
dtmfDelayMs: 2500,
postDtmfSpeechDelayMs: 5000,
},
realtime: {
provider: "openai",
introMessage: "Say exactly: I'm here and listening.",
@@ -955,12 +966,14 @@ describe("google-meet plugin", () => {
dtmfSequence: "123456#",
voiceCallId: "call-1",
dtmfSent: true,
introSent: true,
},
});
expect(voiceCallMocks.joinMeetViaVoiceCallGateway).toHaveBeenCalledWith({
config: expect.objectContaining({ defaultTransport: "twilio" }),
dialInNumber: "+15551234567",
dtmfSequence: "123456#",
message: "Say exactly: I'm here and listening.",
});
});
@@ -984,6 +997,32 @@ describe("google-meet plugin", () => {
});
});
it("delegates Twilio session speech through voice-call", async () => {
// Twilio transport: later speech must be delegated to the voice-call gateway.
const { tools } = setup({ defaultTransport: "twilio" });
const tool = tools[0] as {
execute: (id: string, params: unknown) => Promise<{ details: { session: { id: string } } }>;
};
// Join a Meet session over the phone dial-in path first.
const joined = await tool.execute("id", {
action: "join",
url: "https://meet.google.com/abc-defg-hij",
dialInNumber: "+15551234567",
pin: "123456",
});
// Speak into the existing session via the tool's "speak" action.
const spoken = await tool.execute("id", {
action: "speak",
sessionId: joined.details.session.id,
message: "Say exactly: hello after joining.",
});
// The action reports success and routes through the gateway speak helper
// using the call id returned by the join mock.
expect(spoken.details).toMatchObject({ spoken: true });
expect(voiceCallMocks.speakMeetViaVoiceCallGateway).toHaveBeenCalledWith({
config: expect.objectContaining({ defaultTransport: "twilio" }),
callId: "call-1",
message: "Say exactly: hello after joining.",
});
});
it("reports setup status through the tool", async () => {
const originalPlatform = process.platform;
Object.defineProperty(process, "platform", { value: "darwin" });

View File

@@ -119,6 +119,10 @@ const googleMeetConfigSchema = {
advanced: true,
},
"voiceCall.dtmfDelayMs": { label: "DTMF Delay (ms)", advanced: true },
"voiceCall.postDtmfSpeechDelayMs": {
label: "Post-DTMF Speech Delay (ms)",
advanced: true,
},
"voiceCall.introMessage": { label: "Voice Call Intro Message", advanced: true },
"realtime.provider": {
label: "Realtime Provider",

View File

@@ -52,6 +52,7 @@ export type GoogleMeetConfig = {
token?: string;
requestTimeoutMs: number;
dtmfDelayMs: number;
postDtmfSpeechDelayMs: number;
introMessage?: string;
};
realtime: {
@@ -181,6 +182,7 @@ export const DEFAULT_GOOGLE_MEET_CONFIG: GoogleMeetConfig = {
enabled: true,
requestTimeoutMs: 30_000,
dtmfDelayMs: 2_500,
postDtmfSpeechDelayMs: 5_000,
},
realtime: {
provider: "openai",
@@ -432,6 +434,10 @@ export function resolveGoogleMeetConfigWithEnv(
voiceCall.dtmfDelayMs,
DEFAULT_GOOGLE_MEET_CONFIG.voiceCall.dtmfDelayMs,
),
postDtmfSpeechDelayMs: resolveNumber(
voiceCall.postDtmfSpeechDelayMs,
DEFAULT_GOOGLE_MEET_CONFIG.voiceCall.postDtmfSpeechDelayMs,
),
introMessage: normalizeOptionalString(voiceCall.introMessage),
},
realtime: {

View File

@@ -21,7 +21,11 @@ import type {
GoogleMeetJoinResult,
GoogleMeetSession,
} from "./transports/types.js";
import { endMeetVoiceCallGatewayCall, joinMeetViaVoiceCallGateway } from "./voice-call-gateway.js";
import {
endMeetVoiceCallGatewayCall,
joinMeetViaVoiceCallGateway,
speakMeetViaVoiceCallGateway,
} from "./voice-call-gateway.js";
function nowIso(): string {
return new Date().toISOString();
@@ -301,6 +305,7 @@ export class GoogleMeetRuntime {
return { session: reusable, spoken };
}
const createdAt = nowIso();
let delegatedTwilioSpoken = false;
const session: GoogleMeetSession = {
id: `meet_${randomUUID()}`,
@@ -398,14 +403,22 @@ export class GoogleMeetRuntime {
config: this.params.config,
dialInNumber,
dtmfSequence,
message:
mode === "realtime"
? (request.message ??
this.params.config.voiceCall.introMessage ??
this.params.config.realtime.introMessage)
: undefined,
})
: undefined;
delegatedTwilioSpoken = Boolean(voiceCallResult?.introSent);
session.twilio = {
dialInNumber,
pinProvided: Boolean(request.pin ?? this.params.config.twilio.defaultPin),
dtmfSequence,
voiceCallId: voiceCallResult?.callId,
dtmfSent: voiceCallResult?.dtmfSent,
introSent: voiceCallResult?.introSent,
};
if (voiceCallResult?.callId) {
this.#sessionStops.set(session.id, async () => {
@@ -428,9 +441,11 @@ export class GoogleMeetRuntime {
this.#sessions.set(session.id, session);
const spoken =
mode === "realtime" && speechInstructions
? (await this.speak(session.id, speechInstructions)).spoken
: false;
transport === "twilio"
? delegatedTwilioSpoken
: mode === "realtime" && speechInstructions
? (await this.speak(session.id, speechInstructions)).spoken
: false;
return { session, spoken };
}
@@ -459,6 +474,20 @@ export class GoogleMeetRuntime {
if (!session) {
return { found: false, spoken: false };
}
if (session.transport === "twilio" && session.twilio?.voiceCallId) {
await speakMeetViaVoiceCallGateway({
config: this.params.config,
callId: session.twilio.voiceCallId,
message:
instructions ||
this.params.config.voiceCall.introMessage ||
this.params.config.realtime.introMessage ||
"",
});
session.twilio.introSent = true;
session.updatedAt = nowIso();
return { found: true, spoken: true, session };
}
await this.#refreshBrowserHealthForChromeSession(session);
const speak = this.#sessionSpeakers.get(sessionId);
if (!speak || session.state !== "active") {

View File

@@ -86,6 +86,7 @@ export type GoogleMeetSession = {
dtmfSequence?: string;
voiceCallId?: string;
dtmfSent?: boolean;
introSent?: boolean;
};
notes: string[];
};

View File

@@ -27,25 +27,49 @@ describe("Google Meet voice-call gateway", () => {
gatewayMocks.startGatewayClientWhenEventLoopReady.mockClear();
});
it("starts Twilio Meet calls in conversation mode with the realtime intro by default", async () => {
it("starts Twilio Meet calls silently, sends DTMF, then speaks the realtime intro", async () => {
const config = resolveGoogleMeetConfig({
voiceCall: { gatewayUrl: "ws://127.0.0.1:18789" },
voiceCall: {
gatewayUrl: "ws://127.0.0.1:18789",
dtmfDelayMs: 1,
postDtmfSpeechDelayMs: 1,
},
realtime: { introMessage: "Say exactly: I'm here and listening." },
});
await joinMeetViaVoiceCallGateway({
config,
dialInNumber: "+15551234567",
dtmfSequence: "123456#",
message: "Say exactly: I'm here and listening.",
});
expect(gatewayMocks.request).toHaveBeenCalledWith(
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
1,
"voicecall.start",
{
to: "+15551234567",
message: "Say exactly: I'm here and listening.",
mode: "conversation",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
2,
"voicecall.dtmf",
{
callId: "call-1",
digits: "123456#",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
3,
"voicecall.speak",
{
callId: "call-1",
message: "Say exactly: I'm here and listening.",
},
{ timeoutMs: 30_000 },
);
});
});

View File

@@ -13,9 +13,15 @@ type VoiceCallStartResult = {
error?: string;
};
// Response envelope for a `voicecall.speak` gateway request. Callers treat
// only an explicit `success: false` as a failure; a missing flag passes.
type VoiceCallSpeakResult = {
success?: boolean;
error?: string;
};
// Result of joining a Meet via the Voice Call gateway: the gateway call id,
// plus whether DTMF digits and the delayed intro message were sent as part
// of this join.
export type VoiceCallMeetJoinResult = {
callId: string;
dtmfSent: boolean;
introSent: boolean;
};
async function createConnectedGatewayClient(
@@ -67,6 +73,7 @@ export async function joinMeetViaVoiceCallGateway(params: {
config: GoogleMeetConfig;
dialInNumber: string;
dtmfSequence?: string;
message?: string;
}): Promise<VoiceCallMeetJoinResult> {
let client: VoiceCallGatewayClient | undefined;
@@ -76,7 +83,6 @@ export async function joinMeetViaVoiceCallGateway(params: {
"voicecall.start",
{
to: params.dialInNumber,
message: params.config.voiceCall.introMessage ?? params.config.realtime.introMessage,
mode: "conversation",
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
@@ -95,7 +101,25 @@ export async function joinMeetViaVoiceCallGateway(params: {
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
);
}
return { callId: start.callId, dtmfSent: Boolean(params.dtmfSequence) };
if (params.message) {
await sleep(params.config.voiceCall.postDtmfSpeechDelayMs);
const spoken = (await client.request(
"voicecall.speak",
{
callId: start.callId,
message: params.message,
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
)) as VoiceCallSpeakResult;
if (spoken.success === false) {
throw new Error(spoken.error || "voicecall.speak failed");
}
}
return {
callId: start.callId,
dtmfSent: Boolean(params.dtmfSequence),
introSent: Boolean(params.message),
};
} finally {
await client?.stopAndWait({ timeoutMs: 1_000 });
}
@@ -120,3 +144,28 @@ export async function endMeetVoiceCallGatewayCall(params: {
await client?.stopAndWait({ timeoutMs: 1_000 });
}
}
/**
 * Speaks a message on an already-active Meet dial-in call by issuing a
 * `voicecall.speak` request through a short-lived Voice Call gateway client.
 *
 * A fresh gateway connection is opened per invocation and torn down in the
 * `finally` block whether or not the request succeeds.
 *
 * @param params.config  Resolved Google Meet plugin configuration (supplies
 *                       the gateway connection details and request timeout).
 * @param params.callId  Gateway call id of the live Meet phone call.
 * @param params.message Text the gateway should speak into the call.
 * @throws Error when the gateway reports `success: false` for the request.
 */
export async function speakMeetViaVoiceCallGateway(params: {
  config: GoogleMeetConfig;
  callId: string;
  message: string;
}): Promise<void> {
  let client: VoiceCallGatewayClient | undefined;
  try {
    client = await createConnectedGatewayClient(params.config);
    const timeoutMs = params.config.voiceCall.requestTimeoutMs;
    const payload = { callId: params.callId, message: params.message };
    const result = (await client.request("voicecall.speak", payload, {
      timeoutMs,
    })) as VoiceCallSpeakResult;
    if (result.success === false) {
      throw new Error(result.error || "voicecall.speak failed");
    }
  } finally {
    // Best-effort teardown; `client` stays undefined if connecting threw.
    await client?.stopAndWait({ timeoutMs: 1_000 });
  }
}

View File

@@ -369,12 +369,27 @@ export default definePluginEntry({
"voicecall.speak",
async ({ params, respond }: GatewayRequestHandlerOptions) => {
try {
await respondToCallMessageAction({
requestParams: params,
respond,
action: (request) => request.rt.manager.speak(request.callId, request.message),
failure: "speak failed",
});
const request = await resolveCallMessageRequest(params);
if ("error" in request) {
respond(false, { error: request.error });
return;
}
if (request.rt.config.realtime.enabled) {
const realtimeResult = request.rt.webhookServer.speakRealtime(
request.callId,
request.message,
);
if (realtimeResult.success) {
respond(true, { success: true });
return;
}
}
const result = await request.rt.manager.speak(request.callId, request.message);
if (!result.success) {
respond(false, { error: result.error || "speak failed" });
return;
}
respond(true, { success: true });
} catch (err) {
sendError(respond, err);
}

View File

@@ -195,6 +195,13 @@ export class VoiceCallWebhookServer {
return this.realtimeHandler;
}
/**
 * Routes a speak request to the registered realtime voice handler.
 * Returns a failure result (rather than throwing) when no realtime handler
 * has been attached to this webhook server.
 */
speakRealtime(callId: string, instructions: string): { success: boolean; error?: string } {
  const handler = this.realtimeHandler;
  return handler
    ? handler.speak(callId, instructions)
    : { success: false, error: "Realtime voice handler is not configured" };
}
// Registers (or replaces) the realtime voice handler that speakRealtime and
// other realtime paths delegate to.
setRealtimeHandler(handler: RealtimeCallHandler): void {
this.realtimeHandler = handler;
}

View File

@@ -214,6 +214,121 @@ describe("RealtimeCallHandler path routing", () => {
}
});
it("does not emit an outbound realtime greeting without an initial message", async () => {
// Capture the callbacks passed to createBridge so the test can invoke
// onReady manually after the bridge is created.
let callbacks:
| {
onReady?: () => void;
}
| undefined;
const triggerGreeting = vi.fn();
const createBridge = vi.fn(
(request: Parameters<RealtimeVoiceProviderPlugin["createBridge"]>[0]) => {
callbacks = request;
return makeBridge({ triggerGreeting });
},
);
// Outbound Twilio call record resolved from the provider call id.
const getCallByProviderCallId = vi.fn(
(): CallRecord => ({
callId: "call-1",
providerCallId: "CA-silent",
provider: "twilio",
direction: "outbound",
state: "ringing",
from: "+15550001234",
to: "+15550009999",
startedAt: Date.now(),
transcript: [],
processedEventIds: [],
metadata: {},
}),
);
const handler = makeHandler(undefined, {
manager: {
getCallByProviderCallId,
},
realtimeProvider: makeRealtimeProvider(createBridge),
});
const server = await startRealtimeServer(handler);
try {
const ws = await connectWs(server.url);
try {
// Send a media-stream "start" frame so the handler creates a bridge.
ws.send(
JSON.stringify({
event: "start",
start: { streamSid: "MZ-silent", callSid: "CA-silent" },
}),
);
await vi.waitFor(() => {
expect(createBridge).toHaveBeenCalled();
});
// With no initial message configured, readiness must not trigger speech.
callbacks?.onReady?.();
expect(triggerGreeting).not.toHaveBeenCalled();
} finally {
if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) {
ws.close();
}
}
} finally {
await server.close();
}
});
it("speaks through the active outbound realtime bridge by call id", async () => {
const triggerGreeting = vi.fn();
const createBridge = vi.fn(() => makeBridge({ triggerGreeting }));
// Outbound Twilio call record resolved from the provider call id.
const getCallByProviderCallId = vi.fn(
(): CallRecord => ({
callId: "call-1",
providerCallId: "CA-speak",
provider: "twilio",
direction: "outbound",
state: "ringing",
from: "+15550001234",
to: "+15550009999",
startedAt: Date.now(),
transcript: [],
processedEventIds: [],
metadata: {},
}),
);
const handler = makeHandler(undefined, {
manager: {
getCallByProviderCallId,
},
realtimeProvider: makeRealtimeProvider(createBridge),
});
const server = await startRealtimeServer(handler);
try {
const ws = await connectWs(server.url);
try {
// Send a media-stream "start" frame so the handler registers a bridge
// under both the internal call id and the provider call sid.
ws.send(
JSON.stringify({
event: "start",
start: { streamSid: "MZ-speak", callSid: "CA-speak" },
}),
);
await vi.waitFor(() => {
expect(createBridge).toHaveBeenCalled();
});
// speak() by internal call id must reach the live bridge's greeting path.
expect(handler.speak("call-1", "Say exactly: hello from Meet.")).toEqual({
success: true,
});
expect(triggerGreeting).toHaveBeenCalledWith("Say exactly: hello from Meet.");
} finally {
if (ws.readyState !== WebSocket.CLOSED && ws.readyState !== WebSocket.CLOSING) {
ws.close();
}
}
} finally {
await server.close();
}
});
it("submits continuing responses only for realtime agent consult calls", async () => {
let callbacks:
| {

View File

@@ -41,7 +41,7 @@ function buildGreetingInstructions(
): string | undefined {
const trimmedGreeting = greeting?.trim();
if (!trimmedGreeting) {
return baseInstructions;
return undefined;
}
const intro =
"Start the call by greeting the caller naturally. Include this greeting in your first spoken reply:";
@@ -64,9 +64,15 @@ type CallRegistration = {
// Alias for the provider bridge session type tracked per active call.
type ActiveRealtimeVoiceBridge = RealtimeVoiceBridgeSession;
// Outcome of RealtimeCallHandler.speak: a success flag, with an error message
// when no active bridge exists for the call or triggering speech threw.
type RealtimeSpeakResult = {
success: boolean;
error?: string;
};
export class RealtimeCallHandler {
private readonly toolHandlers = new Map<string, ToolHandlerFn>();
private readonly pendingStreamTokens = new Map<string, PendingStreamToken>();
private readonly activeBridgesByCallId = new Map<string, ActiveRealtimeVoiceBridge>();
private publicOrigin: string | null = null;
private publicPathPrefix = "";
@@ -199,6 +205,19 @@ export class RealtimeCallHandler {
this.toolHandlers.set(name, fn);
}
/**
 * Injects spoken instructions into the live realtime bridge for `callId`.
 * Fails softly with a descriptive error when no bridge is registered for the
 * call, and converts a throwing `triggerGreeting` into a failure result.
 */
speak(callId: string, instructions: string): RealtimeSpeakResult {
  const activeBridge = this.activeBridgesByCallId.get(callId);
  if (activeBridge === undefined) {
    return { success: false, error: "No active realtime bridge for call" };
  }
  try {
    activeBridge.triggerGreeting(instructions);
  } catch (error) {
    return { success: false, error: formatErrorMessage(error) };
  }
  return { success: true };
}
private issueStreamToken(meta: Omit<PendingStreamToken, "expiry"> = {}): string {
const token = randomUUID();
this.pendingStreamTokens.set(token, { expiry: Date.now() + STREAM_TOKEN_TTL_MS, ...meta });
@@ -254,7 +273,7 @@ export class RealtimeCallHandler {
instructions: this.config.instructions,
tools: this.config.tools,
initialGreetingInstructions,
triggerGreetingOnReady: true,
triggerGreetingOnReady: Boolean(initialGreetingInstructions),
audioSink: {
isOpen: () => ws.readyState === WebSocket.OPEN,
sendAudio: (muLaw) => {
@@ -312,6 +331,8 @@ export class RealtimeCallHandler {
console.error("[voice-call] realtime voice error:", error.message);
},
onClose: (reason) => {
this.activeBridgesByCallId.delete(callId);
this.activeBridgesByCallId.delete(callSid);
if (reason !== "error") {
return;
}
@@ -330,6 +351,14 @@ export class RealtimeCallHandler {
});
},
});
this.activeBridgesByCallId.set(callId, bridge);
this.activeBridgesByCallId.set(callSid, bridge);
const closeBridge = bridge.close.bind(bridge);
bridge.close = () => {
this.activeBridgesByCallId.delete(callId);
this.activeBridgesByCallId.delete(callSid);
closeBridge();
};
bridge.connect().catch((error: Error) => {
console.error("[voice-call] Failed to connect realtime bridge:", error);