fix: improve google meet twilio join sequencing

2026-05-06 05:40:44 +00:00 · 2026-05-02 10:56:08 +01:00
parent 59fb9e5ca7
commit 1634f91a35
7 changed files with 204 additions and 25 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,8 @@ Docs: https://docs.openclaw.ai
 - Gateway/pricing: abort in-flight model pricing catalog fetches when Gateway shutdown stops the refresh loop, and avoid post-stop cache writes or refresh timers. Fixes #72208. Thanks @rzcq.
 - Codex/app-server: make startup retry cleanup ownership-aware so concurrent Codex lanes cannot close another lane's freshly restarted shared app-server client. Thanks @vincentkoc.
 - Google Meet/Twilio: report missing dial-in details during setup and explain that Twilio cannot join Meet URLs without a phone dial plan.
+- Google Meet/Twilio: start the phone leg before sending Meet PIN DTMF, delay intro speech until after the post-connect dial sequence, and log each stage so operators can tell Twilio-leg audio from Meet-room audio.
+- Voice Call: accept provider call IDs for gateway speak/continue requests and report ended-call state from history instead of returning a generic "Call not found" for stale calls.
 - Control UI/Talk: allow the OpenAI Realtime WebRTC offer endpoint through the Control UI CSP, configure browser sessions with explicit VAD/transcription input settings, and surface OpenAI realtime error/lifecycle events instead of leaving Talk stuck as live with no diagnostic. Fixes #73427.
 - Plugins: clarify config-selected duplicate plugin override diagnostics and document manifest schema updates for bundled-plugin forks. Fixes #8582. Thanks @sachah.
 - CLI backends/Claude: make live-session JSONL turn caps bounded and configurable via `reliability.outputLimits`, raising the default guard for tool-heavy Claude CLI turns while preserving memory limits. Fixes #75838. Thanks @hcordoba840.
--- a/docs/plugins/google-meet.md
+++ b/docs/plugins/google-meet.md
@@ -1548,19 +1548,21 @@ participant:
 - Run `openclaw voicecall tail` and check that Twilio webhooks are arriving at
  the Gateway.
 - Run `openclaw logs --follow` and look for the Twilio Meet sequence: Google
-  Meet delegates the join, Voice Call stores pre-connect DTMF TwiML, serves
-  that initial TwiML, then serves realtime TwiML and starts the realtime bridge
-  with `initialGreeting=queued`.
+  Meet delegates the join, Voice Call starts the phone leg, Google Meet waits
+  `voiceCall.dtmfDelayMs`, sends DTMF with `voicecall.dtmf`, waits
+  `voiceCall.postDtmfSpeechDelayMs`, then requests intro speech with
+  `voicecall.speak`.
 - Re-run `openclaw googlemeet setup --transport twilio`; a green setup check is
  required but does not prove the meeting PIN sequence is correct.
 - Confirm the dial-in number belongs to the same Meet invitation and region as
  the PIN.
-- Increase the leading pauses in `--dtmf-sequence` if Meet answers slowly, for
-  example `wwww123456#`.
+- Increase `voiceCall.dtmfDelayMs` if Meet answers slowly or the call transcript
+  still shows the prompt asking for a PIN after DTMF was sent.
 - If the participant joins but you do not hear the greeting, check
-  `openclaw logs --follow` for realtime TwiML, realtime bridge startup, and
-  `initialGreeting=queued`. The greeting is generated from the initial
-  `voicecall.start` message after the realtime bridge connects.
+  `openclaw logs --follow` for the post-DTMF `voicecall.speak` request and
+  either media-stream TTS playback or the Twilio `<Say>` fallback. If the call
+  transcript still contains "enter the meeting PIN", the phone leg has not joined
+  the Meet room yet, so meeting participants will not hear speech.

 If webhooks do not arrive, debug the Voice Call plugin first: the provider must
 reach `plugins.entries.voice-call.config.publicUrl` or the configured tunnel.
--- a/extensions/google-meet/src/runtime.ts
+++ b/extensions/google-meet/src/runtime.ts
@@ -491,7 +491,7 @@ export class GoogleMeetRuntime {
        session.notes.push(
          this.params.config.voiceCall.enabled
            ? dtmfSequence
-              ? "Twilio transport delegated the call to the voice-call plugin and queued configured DTMF."
+              ? "Twilio transport delegated the phone leg to the voice-call plugin, then sent configured DTMF after connect before speaking."
              : "Twilio transport delegated the call to the voice-call plugin without configured DTMF."
            : "Twilio transport is an explicit dial plan; voice-call delegation is disabled.",
        );
--- a/extensions/google-meet/src/voice-call-gateway.test.ts
+++ b/extensions/google-meet/src/voice-call-gateway.test.ts
@@ -21,39 +21,59 @@ vi.mock("openclaw/plugin-sdk/gateway-runtime", () => ({

 describe("Google Meet voice-call gateway", () => {
  beforeEach(() => {
+    vi.useRealTimers();
    gatewayMocks.request.mockReset();
    gatewayMocks.request.mockResolvedValue({ callId: "call-1" });
    gatewayMocks.stopAndWait.mockClear();
    gatewayMocks.startGatewayClientWhenEventLoopReady.mockClear();
  });

-  it("starts Twilio Meet calls with pre-connect DTMF and intro metadata", async () => {
+  it("starts Twilio Meet calls, sends delayed DTMF, then speaks the intro", async () => {
    const config = resolveGoogleMeetConfig({
      voiceCall: {
        gatewayUrl: "ws://127.0.0.1:18789",
        dtmfDelayMs: 1,
+        postDtmfSpeechDelayMs: 2,
      },
      realtime: { introMessage: "Say exactly: I'm here and listening." },
    });

-    await joinMeetViaVoiceCallGateway({
+    const join = joinMeetViaVoiceCallGateway({
      config,
      dialInNumber: "+15551234567",
      dtmfSequence: "123456#",
      message: "Say exactly: I'm here and listening.",
    });

+    await join;
+
    expect(gatewayMocks.request).toHaveBeenNthCalledWith(
      1,
      "voicecall.start",
      {
        to: "+15551234567",
        mode: "conversation",
-        message: "Say exactly: I'm here and listening.",
-        dtmfSequence: "123456#",
      },
      { timeoutMs: 30_000 },
    );
-    expect(gatewayMocks.request).toHaveBeenCalledTimes(1);
+    expect(gatewayMocks.request).toHaveBeenNthCalledWith(
+      2,
+      "voicecall.dtmf",
+      {
+        callId: "call-1",
+        digits: "123456#",
+      },
+      { timeoutMs: 30_000 },
+    );
+    expect(gatewayMocks.request).toHaveBeenNthCalledWith(
+      3,
+      "voicecall.speak",
+      {
+        callId: "call-1",
+        message: "Say exactly: I'm here and listening.",
+      },
+      { timeoutMs: 30_000 },
+    );
+    expect(gatewayMocks.request).toHaveBeenCalledTimes(3);
  });
 });
--- a/extensions/google-meet/src/voice-call-gateway.ts
+++ b/extensions/google-meet/src/voice-call-gateway.ts
@@ -18,12 +18,24 @@ type VoiceCallSpeakResult = {
  error?: string;
 };

+type VoiceCallDtmfResult = {
+  success?: boolean;
+  error?: string;
+};
+
 type VoiceCallMeetJoinResult = {
  callId: string;
  dtmfSent: boolean;
  introSent: boolean;
 };

+/**
+ * Resolves after `ms` milliseconds; a zero or negative delay resolves without starting a timer.
+ */
+function sleep(ms: number): Promise<void> {
+  return ms <= 0 ? Promise.resolve() : new Promise<void>((wake) => setTimeout(wake, ms));
+}
+
 async function createConnectedGatewayClient(
  config: GoogleMeetConfig,
 ): Promise<VoiceCallGatewayClient> {
@@ -81,15 +93,13 @@ export async function joinMeetViaVoiceCallGateway(params: {
  try {
    client = await createConnectedGatewayClient(params.config);
    params.logger?.info(
-      `[google-meet] Delegating Twilio join to Voice Call (dtmf=${params.dtmfSequence ? "yes" : "no"}, intro=${params.message ? "yes" : "no"})`,
+      `[google-meet] Delegating Twilio join to Voice Call (dtmf=${params.dtmfSequence ? "post-connect" : "none"}, intro=${params.message ? "delayed" : "none"})`,
    );
    const start = (await client.request(
      "voicecall.start",
      {
        to: params.dialInNumber,
        mode: "conversation",
-        ...(params.message ? { message: params.message } : {}),
-        ...(params.dtmfSequence ? { dtmfSequence: params.dtmfSequence } : {}),
      },
      { timeoutMs: params.config.voiceCall.requestTimeoutMs },
    )) as VoiceCallStartResult;
@@ -97,12 +107,60 @@ export async function joinMeetViaVoiceCallGateway(params: {
      throw new Error(start.error || "voicecall.start did not return callId");
    }
    params.logger?.info(
-      `[google-meet] Voice Call Twilio join started: callId=${start.callId} dtmf=${params.dtmfSequence ? "yes" : "no"} intro=${params.message ? "yes" : "no"}`,
+      `[google-meet] Voice Call Twilio phone leg started: callId=${start.callId}`,
    );
+    let dtmfSent = false;
+    if (params.dtmfSequence) {
+      const delayMs = params.config.voiceCall.dtmfDelayMs;
+      params.logger?.info(
+        `[google-meet] Waiting ${delayMs}ms before sending Meet DTMF for callId=${start.callId}`,
+      );
+      await sleep(delayMs);
+      const dtmf = (await client.request(
+        "voicecall.dtmf",
+        {
+          callId: start.callId,
+          digits: params.dtmfSequence,
+        },
+        { timeoutMs: params.config.voiceCall.requestTimeoutMs },
+      )) as VoiceCallDtmfResult;
+      if (dtmf.success === false) {
+        throw new Error(dtmf.error || "voicecall.dtmf failed");
+      }
+      dtmfSent = true;
+      params.logger?.info(
+        `[google-meet] Meet DTMF sent after phone leg connected: callId=${start.callId} digits=${params.dtmfSequence.length}`,
+      );
+    }
+    let introSent = false;
+    if (params.message) {
+      const delayMs = params.dtmfSequence ? params.config.voiceCall.postDtmfSpeechDelayMs : 0;
+      if (delayMs > 0) {
+        params.logger?.info(
+          `[google-meet] Waiting ${delayMs}ms after Meet DTMF before speaking intro for callId=${start.callId}`,
+        );
+        await sleep(delayMs);
+      }
+      const spoken = (await client.request(
+        "voicecall.speak",
+        {
+          callId: start.callId,
+          message: params.message,
+        },
+        { timeoutMs: params.config.voiceCall.requestTimeoutMs },
+      )) as VoiceCallSpeakResult;
+      if (spoken.success === false) {
+        throw new Error(spoken.error || "voicecall.speak failed");
+      }
+      introSent = true;
+      params.logger?.info(
+        `[google-meet] Intro speech requested after Meet dial sequence: callId=${start.callId}`,
+      );
+    }
    return {
      callId: start.callId,
-      dtmfSent: Boolean(params.dtmfSequence),
-      introSent: Boolean(params.message),
+      dtmfSent,
+      introSent,
    };
  } finally {
    await client?.stopAndWait({ timeoutMs: 1_000 });
--- a/extensions/voice-call/index.test.ts
+++ b/extensions/voice-call/index.test.ts
@@ -6,6 +6,7 @@ import { createTestPluginApi } from "openclaw/plugin-sdk/plugin-test-api";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 import type { OpenClawPluginApi } from "./api.js";
 import type { VoiceCallRuntime } from "./runtime-entry.js";
+import type { CallRecord } from "./src/types.js";

 let runtimeStub: VoiceCallRuntime;

@@ -52,8 +53,12 @@ function captureStdout() {
 }

 function createRuntimeStub(callId = "call-1"): VoiceCallRuntime {
+  const call = createCallRecord({ callId });
  return {
-    config: { toNumber: "+15550001234" } as VoiceCallRuntime["config"],
+    config: {
+      toNumber: "+15550001234",
+      realtime: { enabled: false },
+    } as VoiceCallRuntime["config"],
    provider: {} as VoiceCallRuntime["provider"],
    manager: {
      initiateCall: vi.fn(async () => ({ callId, success: true })),
@@ -64,17 +69,35 @@ function createRuntimeStub(callId = "call-1"): VoiceCallRuntime {
      speak: vi.fn(async () => ({ success: true })),
      sendDtmf: vi.fn(async () => ({ success: true })),
      endCall: vi.fn(async () => ({ success: true })),
-      getCall: vi.fn((id: string) => (id === callId ? { callId } : undefined)),
+      getCall: vi.fn((id: string) => (id === callId ? call : undefined)),
      getCallByProviderCallId: vi.fn(() => undefined),
-      getActiveCalls: vi.fn(() => [{ callId }]),
+      getActiveCalls: vi.fn(() => [call]),
+      getCallHistory: vi.fn(async () => []),
    } as unknown as VoiceCallRuntime["manager"],
-    webhookServer: {} as VoiceCallRuntime["webhookServer"],
+    webhookServer: {
+      speakRealtime: vi.fn(() => ({ success: false, error: "No active realtime bridge for call" })),
+    } as unknown as VoiceCallRuntime["webhookServer"],
    webhookUrl: "http://127.0.0.1:3334/voice/webhook",
    publicUrl: null,
    stop: vi.fn(async () => {}),
  };
 }

+function createCallRecord(overrides: Partial<CallRecord> = {}): CallRecord {
+  const defaults: CallRecord = {
+    callId: "call-1",
+    provider: "mock",
+    direction: "outbound",
+    state: "active",
+    from: "+15550001111",
+    to: "+15550001234",
+    startedAt: Date.UTC(2026, 4, 2, 9, 0, 0),
+    transcript: [],
+    processedEventIds: [],
+  };
+  return { ...defaults, ...overrides }; // fields passed in overrides replace the defaults
+}
+
 function createServiceContext(): Parameters<NonNullable<Registered["service"]>["start"]>[0] {
  return {
    config: {},
@@ -397,6 +420,60 @@ describe("voice-call plugin", () => {
    expect(respond.mock.calls[0]).toEqual([true, { success: true }]);
  });

+  it("normalizes provider call ids before speaking", async () => {
+    runtimeStub.manager.getCall = vi.fn(() => undefined);
+    runtimeStub.manager.getCallByProviderCallId = vi.fn(() =>
+      createCallRecord({
+        callId: "call-1",
+        providerCallId: "CA123",
+      }),
+    );
+    const { methods } = setup({ provider: "mock" });
+    const handler = methods.get("voicecall.speak") as
+      | ((ctx: {
+          params: Record<string, unknown>;
+          respond: ReturnType<typeof vi.fn>;
+        }) => Promise<void>)
+      | undefined;
+    const respond = vi.fn();
+    // "CA123" misses getCall() and resolves only through getCallByProviderCallId() to "call-1".
+    await handler?.({ params: { callId: "CA123", message: "hello" }, respond });
+
+    expect(runtimeStub.manager.speak).toHaveBeenCalledWith("call-1", "hello");
+    expect(respond.mock.calls[0]).toEqual([true, { success: true }]);
+  });
+
+  it("reports ended call history when speaking to a stale call", async () => {
+    runtimeStub.manager.getCall = vi.fn(() => undefined);
+    runtimeStub.manager.getCallByProviderCallId = vi.fn(() => undefined);
+    runtimeStub.manager.getCallHistory = vi.fn(async () => [
+      createCallRecord({
+        callId: "call-1",
+        providerCallId: "CA123",
+        state: "completed",
+        endReason: "completed",
+        endedAt: Date.UTC(2026, 4, 2, 9, 18, 23),
+      }),
+    ]);
+    const { methods } = setup({ provider: "mock" });
+    const handler = methods.get("voicecall.speak") as
+      | ((ctx: {
+          params: Record<string, unknown>;
+          respond: ReturnType<typeof vi.fn>;
+        }) => Promise<void>)
+      | undefined;
+    const respond = vi.fn();
+    // Neither active lookup matches "CA123"; only getCallHistory() returns it, already completed.
+    await handler?.({ params: { callId: "CA123", message: "hello" }, respond });
+
+    const [ok, , error] = respond.mock.calls[0] ?? [];
+    expect(ok).toBe(false);
+    expect(error.message).toContain("call is not active");
+    expect(error.message).toContain("last state=completed");
+    expect(error.message).toContain("endReason=completed");
+    expect(runtimeStub.manager.speak).not.toHaveBeenCalled();
+  });
+
  it("normalizes legacy config through runtime creation and warns to run doctor", async () => {
    const { methods } = setup({
      enabled: true,
--- a/extensions/voice-call/index.ts
+++ b/extensions/voice-call/index.ts
@@ -302,6 +302,22 @@ export default definePluginEntry({
      respondError(respond, formatErrorMessage(err));
    };

+    // Summarizes the last getCallHistory(100) entry whose callId or providerCallId equals callId.
+    const describeHistoricalCall = async (rt: VoiceCallRuntime, callId: string) => {
+      const match = (await rt.manager.getCallHistory(100)).findLast(
+        (entry) => entry.callId === callId || entry.providerCallId === callId,
+      );
+      if (!match) {
+        return undefined;
+      }
+      const parts = [
+        `last state=${match.state}`,
+        ...(match.endReason ? [`endReason=${match.endReason}`] : []),
+        ...(match.endedAt ? [`endedAt=${new Date(match.endedAt).toISOString()}`] : []),
+      ];
+      return `call is not active (${parts.join(", ")})`;
+    };
+
    const resolveCallMessageRequest = async (params: GatewayRequestHandlerOptions["params"]) => {
      const callId = normalizeOptionalString(params?.callId) ?? "";
      const message = normalizeOptionalString(params?.message) ?? "";
@@ -309,7 +325,11 @@ export default definePluginEntry({
        return { error: "callId and message required" } as const;
      }
      const rt = await ensureRuntime();
-      return { rt, callId, message } as const;
+      const activeCall = rt.manager.getCall(callId) ?? rt.manager.getCallByProviderCallId(callId);
+      if (activeCall) {
+        return { rt, callId: activeCall.callId, message } as const;
+      }
+      return { error: (await describeHistoricalCall(rt, callId)) ?? "Call not found" } as const;
    };

    const initiateCallAndRespond = async (params: {