fix: improve google meet twilio join sequencing

This commit is contained in:
Peter Steinberger
2026-05-02 10:56:08 +01:00
parent 59fb9e5ca7
commit 1634f91a35
7 changed files with 204 additions and 25 deletions

View File

@@ -50,6 +50,8 @@ Docs: https://docs.openclaw.ai
- Gateway/pricing: abort in-flight model pricing catalog fetches when Gateway shutdown stops the refresh loop, and avoid post-stop cache writes or refresh timers. Fixes #72208. Thanks @rzcq.
- Codex/app-server: make startup retry cleanup ownership-aware so concurrent Codex lanes cannot close another lane's freshly restarted shared app-server client. Thanks @vincentkoc.
- Google Meet/Twilio: report missing dial-in details during setup and explain that Twilio cannot join Meet URLs without a phone dial plan.
- Google Meet/Twilio: start the phone leg before sending Meet PIN DTMF, delay intro speech until after the post-connect dial sequence, and log each stage so operators can tell Twilio-leg audio from Meet-room audio.
- Voice Call: accept provider call IDs for gateway speak/continue requests and report ended-call state from history instead of returning a generic "Call not found" for stale calls.
- Control UI/Talk: allow the OpenAI Realtime WebRTC offer endpoint through the Control UI CSP, configure browser sessions with explicit VAD/transcription input settings, and surface OpenAI realtime error/lifecycle events instead of leaving Talk stuck as live with no diagnostic. Fixes #73427.
- Plugins: clarify config-selected duplicate plugin override diagnostics and document manifest schema updates for bundled-plugin forks. Fixes #8582. Thanks @sachah.
- CLI backends/Claude: make live-session JSONL turn caps bounded and configurable via `reliability.outputLimits`, raising the default guard for tool-heavy Claude CLI turns while preserving memory limits. Fixes #75838. Thanks @hcordoba840.

View File

@@ -1548,19 +1548,21 @@ participant:
- Run `openclaw voicecall tail` and check that Twilio webhooks are arriving at
the Gateway.
- Run `openclaw logs --follow` and look for the Twilio Meet sequence: Google
Meet delegates the join, Voice Call stores pre-connect DTMF TwiML, serves
that initial TwiML, then serves realtime TwiML and starts the realtime bridge
with `initialGreeting=queued`.
Meet delegates the join, Voice Call starts the phone leg, Google Meet waits
`voiceCall.dtmfDelayMs`, sends DTMF with `voicecall.dtmf`, waits
`voiceCall.postDtmfSpeechDelayMs`, then requests intro speech with
`voicecall.speak`.
- Re-run `openclaw googlemeet setup --transport twilio`; a green setup check is
required but does not prove the meeting PIN sequence is correct.
- Confirm the dial-in number belongs to the same Meet invitation and region as
the PIN.
- Increase the leading pauses in `--dtmf-sequence` if Meet answers slowly, for
example `wwww123456#`.
- Increase `voiceCall.dtmfDelayMs` if Meet answers slowly or the call transcript
still shows the prompt asking for a PIN after DTMF was sent.
- If the participant joins but you do not hear the greeting, check
`openclaw logs --follow` for realtime TwiML, realtime bridge startup, and
`initialGreeting=queued`. The greeting is generated from the initial
`voicecall.start` message after the realtime bridge connects.
`openclaw logs --follow` for the post-DTMF `voicecall.speak` request and
either media-stream TTS playback or the Twilio `<Say>` fallback. If the call
transcript still contains "enter the meeting PIN", the phone leg has not joined
the Meet room yet, so meeting participants will not hear speech.
If webhooks do not arrive, debug the Voice Call plugin first: the provider must
reach `plugins.entries.voice-call.config.publicUrl` or the configured tunnel.

View File

@@ -491,7 +491,7 @@ export class GoogleMeetRuntime {
session.notes.push(
this.params.config.voiceCall.enabled
? dtmfSequence
? "Twilio transport delegated the call to the voice-call plugin and queued configured DTMF."
? "Twilio transport delegated the phone leg to the voice-call plugin, then sent configured DTMF after connect before speaking."
: "Twilio transport delegated the call to the voice-call plugin without configured DTMF."
: "Twilio transport is an explicit dial plan; voice-call delegation is disabled.",
);

View File

@@ -21,39 +21,59 @@ vi.mock("openclaw/plugin-sdk/gateway-runtime", () => ({
describe("Google Meet voice-call gateway", () => {
beforeEach(() => {
vi.useRealTimers();
gatewayMocks.request.mockReset();
gatewayMocks.request.mockResolvedValue({ callId: "call-1" });
gatewayMocks.stopAndWait.mockClear();
gatewayMocks.startGatewayClientWhenEventLoopReady.mockClear();
});
it("starts Twilio Meet calls with pre-connect DTMF and intro metadata", async () => {
it("starts Twilio Meet calls, sends delayed DTMF, then speaks the intro", async () => {
const config = resolveGoogleMeetConfig({
voiceCall: {
gatewayUrl: "ws://127.0.0.1:18789",
dtmfDelayMs: 1,
postDtmfSpeechDelayMs: 2,
},
realtime: { introMessage: "Say exactly: I'm here and listening." },
});
await joinMeetViaVoiceCallGateway({
const join = joinMeetViaVoiceCallGateway({
config,
dialInNumber: "+15551234567",
dtmfSequence: "123456#",
message: "Say exactly: I'm here and listening.",
});
await join;
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
1,
"voicecall.start",
{
to: "+15551234567",
mode: "conversation",
message: "Say exactly: I'm here and listening.",
dtmfSequence: "123456#",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenCalledTimes(1);
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
2,
"voicecall.dtmf",
{
callId: "call-1",
digits: "123456#",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenNthCalledWith(
3,
"voicecall.speak",
{
callId: "call-1",
message: "Say exactly: I'm here and listening.",
},
{ timeoutMs: 30_000 },
);
expect(gatewayMocks.request).toHaveBeenCalledTimes(3);
});
});

View File

@@ -18,12 +18,24 @@ type VoiceCallSpeakResult = {
error?: string;
};
/**
 * Result shape this module assumes for the `voicecall.dtmf` gateway request.
 * Both fields are optional; the join flow only treats an explicit
 * `success === false` as a failure.
 */
type VoiceCallDtmfResult = {
// Explicit `false` makes the join flow throw; a missing value is not treated as failure.
success?: boolean;
// Failure detail; used as the thrown error message when present.
error?: string;
};
/**
 * Summary of a Twilio Meet join attempt. The fields mirror the object returned
 * at the end of `joinMeetViaVoiceCallGateway`.
 */
type VoiceCallMeetJoinResult = {
// Call id taken from the `voicecall.start` response.
callId: string;
// Whether Meet DTMF digits were sent for this call.
dtmfSent: boolean;
// Whether intro speech was requested for this call.
introSent: boolean;
};
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * Zero and negative delays resolve right away without scheduling a timer;
 * every other value is handed to `setTimeout` unchanged.
 */
function sleep(ms: number): Promise<void> {
  return ms <= 0
    ? Promise.resolve()
    : new Promise((wake) => setTimeout(wake, ms));
}
async function createConnectedGatewayClient(
config: GoogleMeetConfig,
): Promise<VoiceCallGatewayClient> {
@@ -81,15 +93,13 @@ export async function joinMeetViaVoiceCallGateway(params: {
try {
client = await createConnectedGatewayClient(params.config);
params.logger?.info(
`[google-meet] Delegating Twilio join to Voice Call (dtmf=${params.dtmfSequence ? "yes" : "no"}, intro=${params.message ? "yes" : "no"})`,
`[google-meet] Delegating Twilio join to Voice Call (dtmf=${params.dtmfSequence ? "post-connect" : "none"}, intro=${params.message ? "delayed" : "none"})`,
);
const start = (await client.request(
"voicecall.start",
{
to: params.dialInNumber,
mode: "conversation",
...(params.message ? { message: params.message } : {}),
...(params.dtmfSequence ? { dtmfSequence: params.dtmfSequence } : {}),
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
)) as VoiceCallStartResult;
@@ -97,12 +107,60 @@ export async function joinMeetViaVoiceCallGateway(params: {
throw new Error(start.error || "voicecall.start did not return callId");
}
params.logger?.info(
`[google-meet] Voice Call Twilio join started: callId=${start.callId} dtmf=${params.dtmfSequence ? "yes" : "no"} intro=${params.message ? "yes" : "no"}`,
`[google-meet] Voice Call Twilio phone leg started: callId=${start.callId}`,
);
let dtmfSent = false;
if (params.dtmfSequence) {
const delayMs = params.config.voiceCall.dtmfDelayMs;
params.logger?.info(
`[google-meet] Waiting ${delayMs}ms before sending Meet DTMF for callId=${start.callId}`,
);
await sleep(delayMs);
const dtmf = (await client.request(
"voicecall.dtmf",
{
callId: start.callId,
digits: params.dtmfSequence,
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
)) as VoiceCallDtmfResult;
if (dtmf.success === false) {
throw new Error(dtmf.error || "voicecall.dtmf failed");
}
dtmfSent = true;
params.logger?.info(
`[google-meet] Meet DTMF sent after phone leg connected: callId=${start.callId} digits=${params.dtmfSequence.length}`,
);
}
let introSent = false;
if (params.message) {
const delayMs = params.dtmfSequence ? params.config.voiceCall.postDtmfSpeechDelayMs : 0;
if (delayMs > 0) {
params.logger?.info(
`[google-meet] Waiting ${delayMs}ms after Meet DTMF before speaking intro for callId=${start.callId}`,
);
await sleep(delayMs);
}
const spoken = (await client.request(
"voicecall.speak",
{
callId: start.callId,
message: params.message,
},
{ timeoutMs: params.config.voiceCall.requestTimeoutMs },
)) as VoiceCallSpeakResult;
if (spoken.success === false) {
throw new Error(spoken.error || "voicecall.speak failed");
}
introSent = true;
params.logger?.info(
`[google-meet] Intro speech requested after Meet dial sequence: callId=${start.callId}`,
);
}
return {
callId: start.callId,
dtmfSent: Boolean(params.dtmfSequence),
introSent: Boolean(params.message),
dtmfSent,
introSent,
};
} finally {
await client?.stopAndWait({ timeoutMs: 1_000 });

View File

@@ -6,6 +6,7 @@ import { createTestPluginApi } from "openclaw/plugin-sdk/plugin-test-api";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawPluginApi } from "./api.js";
import type { VoiceCallRuntime } from "./runtime-entry.js";
import type { CallRecord } from "./src/types.js";
let runtimeStub: VoiceCallRuntime;
@@ -52,8 +53,12 @@ function captureStdout() {
}
function createRuntimeStub(callId = "call-1"): VoiceCallRuntime {
const call = createCallRecord({ callId });
return {
config: { toNumber: "+15550001234" } as VoiceCallRuntime["config"],
config: {
toNumber: "+15550001234",
realtime: { enabled: false },
} as VoiceCallRuntime["config"],
provider: {} as VoiceCallRuntime["provider"],
manager: {
initiateCall: vi.fn(async () => ({ callId, success: true })),
@@ -64,17 +69,35 @@ function createRuntimeStub(callId = "call-1"): VoiceCallRuntime {
speak: vi.fn(async () => ({ success: true })),
sendDtmf: vi.fn(async () => ({ success: true })),
endCall: vi.fn(async () => ({ success: true })),
getCall: vi.fn((id: string) => (id === callId ? { callId } : undefined)),
getCall: vi.fn((id: string) => (id === callId ? call : undefined)),
getCallByProviderCallId: vi.fn(() => undefined),
getActiveCalls: vi.fn(() => [{ callId }]),
getActiveCalls: vi.fn(() => [call]),
getCallHistory: vi.fn(async () => []),
} as unknown as VoiceCallRuntime["manager"],
webhookServer: {} as VoiceCallRuntime["webhookServer"],
webhookServer: {
speakRealtime: vi.fn(() => ({ success: false, error: "No active realtime bridge for call" })),
} as unknown as VoiceCallRuntime["webhookServer"],
webhookUrl: "http://127.0.0.1:3334/voice/webhook",
publicUrl: null,
stop: vi.fn(async () => {}),
};
}
/**
 * Builds a `CallRecord` fixture for tests.
 *
 * The baseline is an active outbound call on the "mock" provider with fixed
 * phone numbers, a fixed start timestamp, and empty transcript/event lists.
 * Any field present in `overrides` replaces the baseline value.
 */
function createCallRecord(overrides: Partial<CallRecord> = {}): CallRecord {
  // Rebuilt on every call so fixtures never share array instances.
  const defaults: CallRecord = {
    callId: "call-1",
    provider: "mock",
    direction: "outbound",
    state: "active",
    from: "+15550001111",
    to: "+15550001234",
    startedAt: Date.UTC(2026, 4, 2, 9, 0, 0),
    transcript: [],
    processedEventIds: [],
  };
  return { ...defaults, ...overrides };
}
function createServiceContext(): Parameters<NonNullable<Registered["service"]>["start"]>[0] {
return {
config: {},
@@ -397,6 +420,60 @@ describe("voice-call plugin", () => {
expect(respond.mock.calls[0]).toEqual([true, { success: true }]);
});
// Verifies that `voicecall.speak` accepts a provider call id ("CA123") and
// forwards the internal call id ("call-1") to `manager.speak`.
it("normalizes provider call ids before speaking", async () => {
// The lookup by internal id misses, so only the provider-id lookup can resolve the call.
runtimeStub.manager.getCall = vi.fn(() => undefined);
runtimeStub.manager.getCallByProviderCallId = vi.fn(() =>
createCallRecord({
callId: "call-1",
providerCallId: "CA123",
}),
);
const { methods } = setup({ provider: "mock" });
// Narrow the registered gateway method to the handler shape this test invokes.
const handler = methods.get("voicecall.speak") as
| ((ctx: {
params: Record<string, unknown>;
respond: ReturnType<typeof vi.fn>;
}) => Promise<void>)
| undefined;
const respond = vi.fn();
// The request carries the provider call id rather than the internal one.
await handler?.({ params: { callId: "CA123", message: "hello" }, respond });
// speak must receive the internal id taken from the resolved call record.
expect(runtimeStub.manager.speak).toHaveBeenCalledWith("call-1", "hello");
expect(respond.mock.calls[0]).toEqual([true, { success: true }]);
});
// Verifies that `voicecall.speak` on a call that is no longer active reports
// the ended-call details found in history, and never calls `manager.speak`.
it("reports ended call history when speaking to a stale call", async () => {
// Both active-call lookups miss, so only call history can describe the call.
runtimeStub.manager.getCall = vi.fn(() => undefined);
runtimeStub.manager.getCallByProviderCallId = vi.fn(() => undefined);
runtimeStub.manager.getCallHistory = vi.fn(async () => [
createCallRecord({
callId: "call-1",
providerCallId: "CA123",
state: "completed",
endReason: "completed",
endedAt: Date.UTC(2026, 4, 2, 9, 18, 23),
}),
]);
const { methods } = setup({ provider: "mock" });
// Narrow the registered gateway method to the handler shape this test invokes.
const handler = methods.get("voicecall.speak") as
| ((ctx: {
params: Record<string, unknown>;
respond: ReturnType<typeof vi.fn>;
}) => Promise<void>)
| undefined;
const respond = vi.fn();
await handler?.({ params: { callId: "CA123", message: "hello" }, respond });
// respond is expected to be called as (ok, payload, error); the payload slot is ignored here.
const [ok, , error] = respond.mock.calls[0] ?? [];
expect(ok).toBe(false);
expect(error.message).toContain("call is not active");
expect(error.message).toContain("last state=completed");
expect(error.message).toContain("endReason=completed");
// A stale call must not reach the speak path at all.
expect(runtimeStub.manager.speak).not.toHaveBeenCalled();
});
it("normalizes legacy config through runtime creation and warns to run doctor", async () => {
const { methods } = setup({
enabled: true,

View File

@@ -302,6 +302,22 @@ export default definePluginEntry({
respondError(respond, formatErrorMessage(err));
};
/**
 * Describes the last known state of a call that is no longer active.
 *
 * Fetches call history from the runtime manager (passing 100 as its argument)
 * and looks for a record whose internal or provider call id equals `callId`.
 * The list is searched from the end, so the last matching entry wins.
 *
 * @returns A "call is not active (...)" summary, or `undefined` when no
 *   history record matches.
 */
const describeHistoricalCall = async (rt: VoiceCallRuntime, callId: string) => {
  const recentCalls = await rt.manager.getCallHistory(100);
  const match = recentCalls.findLast(
    (entry) => entry.callId === callId || entry.providerCallId === callId,
  );
  if (!match) {
    return undefined;
  }
  // The state is always reported; end details only when the record carries them.
  const details = [`last state=${match.state}`];
  if (match.endReason) {
    details.push(`endReason=${match.endReason}`);
  }
  if (match.endedAt) {
    details.push(`endedAt=${new Date(match.endedAt).toISOString()}`);
  }
  return `call is not active (${details.join(", ")})`;
};
const resolveCallMessageRequest = async (params: GatewayRequestHandlerOptions["params"]) => {
const callId = normalizeOptionalString(params?.callId) ?? "";
const message = normalizeOptionalString(params?.message) ?? "";
@@ -309,7 +325,11 @@ export default definePluginEntry({
return { error: "callId and message required" } as const;
}
const rt = await ensureRuntime();
return { rt, callId, message } as const;
const activeCall = rt.manager.getCall(callId) ?? rt.manager.getCallByProviderCallId(callId);
if (activeCall) {
return { rt, callId: activeCall.callId, message } as const;
}
return { error: (await describeHistoricalCall(rt, callId)) ?? "Call not found" } as const;
};
const initiateCallAndRespond = async (params: {