fix(voice-call): pace realtime Twilio audio
Commit 7fc9a82dca (parent 19f948af2e), committed by Peter Steinberger.
@@ -445,7 +445,7 @@ Enable the Voice Call plugin on the Gateway host, not on the Chrome node:
 ```json5
 {
   plugins: {
-    allow: ["google-meet", "voice-call"],
+    allow: ["google-meet", "voice-call", "google"],
     entries: {
       "google-meet": {
         enabled: true,
@@ -458,8 +458,24 @@ Enable the Voice Call plugin on the Gateway host, not on the Chrome node:
         enabled: true,
         config: {
           provider: "twilio",
           inboundPolicy: "allowlist",
+          realtime: {
+            enabled: true,
+            provider: "google",
+            instructions: "Join this Google Meet as an OpenClaw agent. Be brief.",
+            toolPolicy: "safe-read-only",
+            providers: {
+              google: {
+                silenceDurationMs: 500,
+                startSensitivity: "high",
+              },
+            },
+          },
         },
       },
+      google: {
+        enabled: true,
+      },
     },
   },
 }
@@ -472,8 +488,12 @@ secrets out of `openclaw.json`:
 export TWILIO_ACCOUNT_SID=AC...
 export TWILIO_AUTH_TOKEN=...
 export TWILIO_FROM_NUMBER=+15550001234
+export GEMINI_API_KEY=...
 ```
 
+Use `realtime.provider: "openai"` with the OpenAI provider plugin and
+`OPENAI_API_KEY` instead if that is your realtime voice provider.
 
 Restart or reload the Gateway after enabling `voice-call`; plugin config changes
 do not appear in an already running Gateway process until it reloads.
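
For the OpenAI variant mentioned just above, the `realtime` block would look roughly like this (a sketch, not verbatim from the docs; `providers.openai.apiKey` is assumed to parallel the documented `providers.google.apiKey`):

```json5
realtime: {
  enabled: true,
  provider: "openai",
  providers: {
    openai: {
      // Assumed shape; OPENAI_API_KEY in the environment also works per the note above.
      apiKey: "${OPENAI_API_KEY}",
    },
  },
},
```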
@@ -250,6 +250,9 @@ Current runtime behaviour:
 Defaults: API key from `realtime.providers.google.apiKey`,
 `GEMINI_API_KEY`, or `GOOGLE_GENERATIVE_AI_API_KEY`; model
 `gemini-2.5-flash-native-audio-preview-12-2025`; voice `Kore`.
+`sessionResumption` and `contextWindowCompression` default on for longer,
+reconnectable calls. Use `silenceDurationMs`, `startSensitivity`, and
+`endSensitivity` to tune faster turn-taking on telephony audio.
 
 ```json5
 {
@@ -270,6 +273,8 @@ Current runtime behaviour:
         apiKey: "${GEMINI_API_KEY}",
         model: "gemini-2.5-flash-native-audio-preview-12-2025",
         voice: "Kore",
+        silenceDurationMs: 500,
+        startSensitivity: "high",
       },
     },
   },
@@ -107,6 +107,8 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
       turnCoverage: "only-activity",
       automaticActivityDetectionDisabled: false,
       enableAffectiveDialog: undefined,
+      sessionResumption: undefined,
+      contextWindowCompression: undefined,
       thinkingLevel: undefined,
       thinkingBudget: undefined,
     });
@@ -181,6 +183,8 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
       },
       turnCoverage: "TURN_INCLUDES_ONLY_ACTIVITY",
     },
+    sessionResumption: {},
+    contextWindowCompression: { slidingWindow: {} },
     tools: [
       {
         functionDeclarations: [
@@ -312,6 +316,42 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
     });
   });
 
+  it("can opt out of Google Live session resumption and context compression", async () => {
+    const provider = buildGoogleRealtimeVoiceProvider();
+    const bridge = provider.createBridge({
+      providerConfig: {
+        apiKey: "gemini-key",
+        contextWindowCompression: false,
+        sessionResumption: false,
+      },
+      onAudio: vi.fn(),
+      onClearAudio: vi.fn(),
+    });
+
+    await bridge.connect();
+
+    expect(lastConnectParams().config).not.toHaveProperty("contextWindowCompression");
+    expect(lastConnectParams().config).not.toHaveProperty("sessionResumption");
+  });
+
+  it("captures Google Live resumption handles and reuses them on reconnect", async () => {
+    const provider = buildGoogleRealtimeVoiceProvider();
+    const bridge = provider.createBridge({
+      providerConfig: { apiKey: "gemini-key" },
+      onAudio: vi.fn(),
+      onClearAudio: vi.fn(),
+    });
+
+    await bridge.connect();
+    lastConnectParams().callbacks.onmessage({
+      sessionResumptionUpdate: { resumable: true, newHandle: "resume-1" },
+    });
+
+    await bridge.connect();
+
+    expect(lastConnectParams().config.sessionResumption).toEqual({ handle: "resume-1" });
+  });
+
   it("waits for setup completion before draining audio and firing ready", async () => {
     const provider = buildGoogleRealtimeVoiceProvider();
     const onReady = vi.fn();
@@ -1,20 +1,20 @@
 import { randomUUID } from "node:crypto";
-import {
+import type {
   ActivityHandling,
   Behavior,
   EndSensitivity,
+  FunctionDeclaration,
+  FunctionResponse,
   FunctionResponseScheduling,
+  LiveConnectConfig,
+  LiveServerContent,
+  LiveServerMessage,
+  LiveServerToolCall,
   Modality,
+  RealtimeInputConfig,
   StartSensitivity,
+  ThinkingConfig,
   TurnCoverage,
-  type FunctionDeclaration,
-  type FunctionResponse,
-  type LiveConnectConfig,
-  type LiveServerContent,
-  type LiveServerMessage,
-  type LiveServerToolCall,
-  type RealtimeInputConfig,
-  type ThinkingConfig,
 } from "@google/genai";
 import type { OpenClawConfig } from "openclaw/plugin-sdk/provider-onboard";
 import type {
@@ -47,7 +47,7 @@ const GOOGLE_REALTIME_BROWSER_API_VERSION = "v1alpha";
 const GOOGLE_REALTIME_BROWSER_WEBSOCKET_URL =
   "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
 const MAX_PENDING_AUDIO_CHUNKS = 320;
-const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 700;
+const DEFAULT_AUDIO_STREAM_END_SILENCE_MS = 500;
 const GOOGLE_REALTIME_BROWSER_SESSION_TTL_MS = 30 * 60 * 1000;
 const GOOGLE_REALTIME_BROWSER_NEW_SESSION_TTL_MS = 60 * 1000;
@@ -70,6 +70,8 @@ type GoogleRealtimeVoiceProviderConfig = {
   turnCoverage?: GoogleRealtimeTurnCoverage;
   automaticActivityDetectionDisabled?: boolean;
   enableAffectiveDialog?: boolean;
+  sessionResumption?: boolean;
+  contextWindowCompression?: boolean;
   thinkingLevel?: GoogleRealtimeThinkingLevel;
   thinkingBudget?: number;
 };
@@ -90,6 +92,8 @@ type GoogleRealtimeLiveConfig = {
   turnCoverage?: GoogleRealtimeTurnCoverage;
   automaticActivityDetectionDisabled?: boolean;
   enableAffectiveDialog?: boolean;
+  sessionResumption?: boolean;
+  contextWindowCompression?: boolean;
   thinkingLevel?: GoogleRealtimeThinkingLevel;
   thinkingBudget?: number;
 };
@@ -209,6 +213,8 @@ function normalizeProviderConfig(
     turnCoverage: asTurnCoverage(raw?.turnCoverage),
     automaticActivityDetectionDisabled: asBoolean(raw?.automaticActivityDetectionDisabled),
     enableAffectiveDialog: asBoolean(raw?.enableAffectiveDialog),
+    sessionResumption: asBoolean(raw?.sessionResumption),
+    contextWindowCompression: asBoolean(raw?.contextWindowCompression),
     thinkingLevel: asThinkingLevel(raw?.thinkingLevel),
     thinkingBudget: asFiniteNumber(raw?.thinkingBudget),
   };
@@ -223,9 +229,9 @@ function mapStartSensitivity(
 ): StartSensitivity | undefined {
   switch (value) {
     case "high":
-      return StartSensitivity.START_SENSITIVITY_HIGH;
+      return "START_SENSITIVITY_HIGH" as StartSensitivity;
     case "low":
-      return StartSensitivity.START_SENSITIVITY_LOW;
+      return "START_SENSITIVITY_LOW" as StartSensitivity;
     default:
       return undefined;
   }
@@ -236,9 +242,9 @@ function mapEndSensitivity(
 ): EndSensitivity | undefined {
   switch (value) {
     case "high":
-      return EndSensitivity.END_SENSITIVITY_HIGH;
+      return "END_SENSITIVITY_HIGH" as EndSensitivity;
     case "low":
-      return EndSensitivity.END_SENSITIVITY_LOW;
+      return "END_SENSITIVITY_LOW" as EndSensitivity;
     default:
       return undefined;
   }
@@ -249,9 +255,9 @@ function mapActivityHandling(
 ): ActivityHandling | undefined {
   switch (value) {
     case "no-interruption":
-      return ActivityHandling.NO_INTERRUPTION;
+      return "NO_INTERRUPTION" as ActivityHandling;
     case "start-of-activity-interrupts":
-      return ActivityHandling.START_OF_ACTIVITY_INTERRUPTS;
+      return "START_OF_ACTIVITY_INTERRUPTS" as ActivityHandling;
     default:
       return undefined;
   }
@@ -260,11 +266,11 @@ function mapActivityHandling(
 function mapTurnCoverage(value: GoogleRealtimeTurnCoverage | undefined): TurnCoverage | undefined {
   switch (value) {
     case "only-activity":
-      return TurnCoverage.TURN_INCLUDES_ONLY_ACTIVITY;
+      return "TURN_INCLUDES_ONLY_ACTIVITY" as TurnCoverage;
     case "all-input":
-      return TurnCoverage.TURN_INCLUDES_ALL_INPUT;
+      return "TURN_INCLUDES_ALL_INPUT" as TurnCoverage;
     case "audio-activity-and-all-video":
-      return TurnCoverage.TURN_INCLUDES_AUDIO_ACTIVITY_AND_ALL_VIDEO;
+      return "TURN_INCLUDES_AUDIO_ACTIVITY_AND_ALL_VIDEO" as TurnCoverage;
     default:
       return undefined;
   }
@@ -316,7 +322,7 @@ function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): Func
       parametersJsonSchema: tool.parameters,
     };
     if (tool.name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME) {
-      declaration.behavior = Behavior.NON_BLOCKING;
+      declaration.behavior = "NON_BLOCKING" as Behavior;
     }
     return declaration;
   });
@@ -325,7 +331,7 @@ function buildFunctionDeclarations(tools: RealtimeVoiceTool[] | undefined): Func
 function buildGoogleLiveConnectConfig(config: GoogleRealtimeLiveConfig): LiveConnectConfig {
   const functionDeclarations = buildFunctionDeclarations(config.tools);
   return {
-    responseModalities: [Modality.AUDIO],
+    responseModalities: ["AUDIO" as Modality],
     ...(typeof config.temperature === "number" && config.temperature > 0
       ? { temperature: config.temperature }
       : {}),
@@ -359,7 +365,7 @@ function buildBrowserInitialSetup(model: string) {
     setup: {
       model: toGoogleModelResource(model),
       generationConfig: {
-        responseModalities: [Modality.AUDIO],
+        responseModalities: ["AUDIO" as Modality],
       },
       inputAudioTranscription: {},
       outputAudioTranscription: {},
@@ -403,6 +409,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
   private audioStreamEnded = false;
   private pendingFunctionNames = new Map<string, string>();
   private readonly audioFormat: RealtimeVoiceAudioFormat;
+  private resumptionHandle: string | undefined;
 
   constructor(private readonly config: GoogleRealtimeVoiceBridgeConfig) {
     this.audioFormat = config.audioFormat ?? REALTIME_VOICE_AUDIO_FORMAT_G711_ULAW_8KHZ;
@@ -425,7 +432,17 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
 
     this.session = (await ai.live.connect({
       model: this.config.model ?? GOOGLE_REALTIME_DEFAULT_MODEL,
-      config: buildGoogleLiveConnectConfig(this.config),
+      config: {
+        ...buildGoogleLiveConnectConfig(this.config),
+        ...(this.config.sessionResumption === false
+          ? {}
+          : {
+              sessionResumption: this.resumptionHandle ? { handle: this.resumptionHandle } : {},
+            }),
+        ...(this.config.contextWindowCompression === false
+          ? {}
+          : { contextWindowCompression: { slidingWindow: {} } }),
+      },
       callbacks: {
         onopen: () => {
           this.connected = true;
@@ -548,7 +565,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
         : { output: result },
     };
     if (isConsultTool) {
-      functionResponse.scheduling = FunctionResponseScheduling.WHEN_IDLE;
+      functionResponse.scheduling = "WHEN_IDLE" as FunctionResponseScheduling;
       if (options?.willContinue === true) {
         functionResponse.willContinue = true;
       }
@@ -607,6 +624,7 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
   }
 
   private handleMessage(message: LiveServerMessage): void {
+    this.captureSessionLifecycle(message);
     if (message.setupComplete) {
       this.handleSetupComplete();
     }
@@ -618,6 +636,20 @@ class GoogleRealtimeVoiceBridge implements RealtimeVoiceBridge {
     }
   }
 
+  private captureSessionLifecycle(message: LiveServerMessage): void {
+    const raw = message as unknown as {
+      goAway?: { timeLeft?: string };
+      sessionResumptionUpdate?: { newHandle?: string; resumable?: boolean };
+    };
+    const update = raw.sessionResumptionUpdate;
+    if (update?.resumable && update.newHandle) {
+      this.resumptionHandle = update.newHandle;
+    }
+    if (raw.goAway?.timeLeft) {
+      this.config.onError?.(new Error(`Google Live session goAway: ${raw.goAway.timeLeft}`));
+    }
+  }
+
   private handleSetupComplete(): void {
     this.sessionConfigured = true;
     for (const chunk of this.pendingAudio.splice(0)) {
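
For context, the two lifecycle payloads this method inspects look roughly like the following (shapes inferred from the cast in the code above and from this commit's tests, not from `@google/genai` documentation):

```ts
// Resumption update: the handle is captured so the next connect() can resume.
const resumptionUpdate = {
  sessionResumptionUpdate: { resumable: true, newHandle: "resume-1" },
};

// goAway: the server announces an imminent disconnect; surfaced via onError.
const goAway = { goAway: { timeLeft: "10s" } }; // example duration
```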
@@ -784,6 +816,8 @@ export function buildGoogleRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin
       turnCoverage: config.turnCoverage,
       automaticActivityDetectionDisabled: config.automaticActivityDetectionDisabled,
       enableAffectiveDialog: config.enableAffectiveDialog,
+      sessionResumption: config.sessionResumption,
+      contextWindowCompression: config.contextWindowCompression,
       thinkingLevel: config.thinkingLevel,
       thinkingBudget: config.thinkingBudget,
     });
@@ -589,9 +589,9 @@ export default definePluginEntry({
       respondError(respond, "to required", ErrorCodes.INVALID_REQUEST);
       return;
     }
-    const rt = await ensureRuntime();
     const mode =
       params?.mode === "notify" || params?.mode === "conversation" ? params.mode : undefined;
+    const rt = await ensureRuntime();
     await initiateCallAndRespond({
       rt,
       respond,
@@ -0,0 +1,86 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+import {
+  RealtimeMulawSpeechStartDetector,
+  RealtimeTwilioAudioPacer,
+  calculateMulawRms,
+} from "./realtime-audio-pacer.js";
+
+describe("RealtimeTwilioAudioPacer", () => {
+  afterEach(() => {
+    vi.useRealTimers();
+  });
+
+  it("paces realtime audio as 20ms telephony frames before marks", async () => {
+    vi.useFakeTimers();
+    const sent: unknown[] = [];
+    const pacer = new RealtimeTwilioAudioPacer({
+      streamSid: "MZ-test",
+      sendJson: (message) => {
+        sent.push(message);
+        return true;
+      },
+    });
+
+    pacer.sendAudio(Buffer.alloc(320, 0x7f));
+    pacer.sendMark("audio-1");
+
+    expect(sent).toHaveLength(1);
+    expect(
+      Buffer.from((sent[0] as { media: { payload: string } }).media.payload, "base64"),
+    ).toHaveLength(160);
+
+    await vi.advanceTimersByTimeAsync(20);
+    expect(sent).toHaveLength(2);
+    expect(
+      Buffer.from((sent[1] as { media: { payload: string } }).media.payload, "base64"),
+    ).toHaveLength(160);
+
+    await vi.advanceTimersByTimeAsync(20);
+    expect(sent[2]).toEqual({
+      event: "mark",
+      streamSid: "MZ-test",
+      mark: { name: "audio-1" },
+    });
+  });
+
+  it("clears queued audio immediately", async () => {
+    vi.useFakeTimers();
+    const sent: unknown[] = [];
+    const pacer = new RealtimeTwilioAudioPacer({
+      streamSid: "MZ-test",
+      sendJson: (message) => {
+        sent.push(message);
+        return true;
+      },
+    });
+
+    pacer.sendAudio(Buffer.alloc(480, 0x7f));
+    pacer.clearAudio();
+    await vi.advanceTimersByTimeAsync(100);
+
+    expect(sent).toHaveLength(2);
+    expect(sent[1]).toEqual({ event: "clear", streamSid: "MZ-test" });
+  });
+});
+
+describe("RealtimeMulawSpeechStartDetector", () => {
+  it("detects a speech start after consecutive loud chunks and resets after quiet", () => {
+    const detector = new RealtimeMulawSpeechStartDetector({
+      requiredLoudChunks: 2,
+      requiredQuietChunks: 2,
+      rmsThreshold: 0.02,
+    });
+    const silence = Buffer.alloc(160, 0xff);
+    const speech = Buffer.alloc(160, 0x00);
+
+    expect(calculateMulawRms(silence)).toBeLessThan(0.02);
+    expect(calculateMulawRms(speech)).toBeGreaterThan(0.02);
+    expect(detector.accept(speech)).toBe(false);
+    expect(detector.accept(speech)).toBe(true);
+    expect(detector.accept(speech)).toBe(false);
+    expect(detector.accept(silence)).toBe(false);
+    expect(detector.accept(silence)).toBe(false);
+    expect(detector.accept(speech)).toBe(false);
+    expect(detector.accept(speech)).toBe(true);
+  });
+});
extensions/voice-call/src/webhook/realtime-audio-pacer.ts (new file, 176 lines)
@@ -0,0 +1,176 @@
+import { mulawToPcm } from "openclaw/plugin-sdk/realtime-voice";
+
+const TELEPHONY_SAMPLE_RATE = 8_000;
+const TELEPHONY_CHUNK_BYTES = 160;
+const TELEPHONY_CHUNK_MS = 20;
+const DEFAULT_SPEECH_RMS_THRESHOLD = 0.02;
+const DEFAULT_REQUIRED_LOUD_CHUNKS = 2;
+const DEFAULT_REQUIRED_QUIET_CHUNKS = 10;
+
+type RealtimeTwilioAudioQueueItem =
+  | {
+      chunk: Buffer;
+      durationMs: number;
+      type: "audio";
+    }
+  | {
+      name: string;
+      type: "mark";
+    };
+
+export type RealtimeTwilioAudioPacerSendJson = (message: unknown) => boolean;
+
+export class RealtimeTwilioAudioPacer {
+  private queue: RealtimeTwilioAudioQueueItem[] = [];
+  private timer: ReturnType<typeof setTimeout> | null = null;
+  private closed = false;
+
+  constructor(
+    private readonly params: {
+      sendJson: RealtimeTwilioAudioPacerSendJson;
+      streamSid: string;
+    },
+  ) {}
+
+  sendAudio(muLaw: Buffer): void {
+    if (this.closed || muLaw.length === 0) {
+      return;
+    }
+    for (let offset = 0; offset < muLaw.length; offset += TELEPHONY_CHUNK_BYTES) {
+      const chunk = Buffer.from(muLaw.subarray(offset, offset + TELEPHONY_CHUNK_BYTES));
+      this.queue.push({
+        type: "audio",
+        chunk,
+        durationMs: Math.max(1, Math.round((chunk.length / TELEPHONY_SAMPLE_RATE) * 1000)),
+      });
+    }
+    this.ensurePump();
+  }
+
+  sendMark(name: string): void {
+    if (this.closed || !name) {
+      return;
+    }
+    this.queue.push({ type: "mark", name });
+    this.ensurePump();
+  }
+
+  clearAudio(): void {
+    if (this.closed) {
+      return;
+    }
+    this.clearTimer();
+    this.queue = [];
+    this.params.sendJson({ event: "clear", streamSid: this.params.streamSid });
+  }
+
+  close(): void {
+    this.closed = true;
+    this.clearTimer();
+    this.queue = [];
+  }
+
+  private clearTimer(): void {
+    if (!this.timer) {
+      return;
+    }
+    clearTimeout(this.timer);
+    this.timer = null;
+  }
+
+  private ensurePump(): void {
+    if (!this.timer) {
+      this.pump();
+    }
+  }
+
+  private pump(): void {
+    this.timer = null;
+    if (this.closed) {
+      return;
+    }
+    const item = this.queue.shift();
+    if (!item) {
+      return;
+    }
+
+    let delayMs = 0;
+    let sent = true;
+    if (item.type === "audio") {
+      sent = this.params.sendJson({
+        event: "media",
+        streamSid: this.params.streamSid,
+        media: { payload: item.chunk.toString("base64") },
+      });
+      delayMs = item.durationMs || TELEPHONY_CHUNK_MS;
+    } else {
+      sent = this.params.sendJson({
+        event: "mark",
+        streamSid: this.params.streamSid,
+        mark: { name: item.name },
+      });
+    }
+
+    if (!sent) {
+      this.queue = [];
+      return;
+    }
+    if (this.queue.length > 0) {
+      this.timer = setTimeout(() => this.pump(), delayMs);
+    }
+  }
+}
+
+export function calculateMulawRms(muLaw: Buffer): number {
+  if (muLaw.length === 0) {
+    return 0;
+  }
+  const pcm = mulawToPcm(muLaw);
+  const samples = Math.floor(pcm.length / 2);
+  if (samples === 0) {
+    return 0;
+  }
+  let sum = 0;
+  for (let i = 0; i < samples; i += 1) {
+    const normalized = pcm.readInt16LE(i * 2) / 32768;
+    sum += normalized * normalized;
+  }
+  return Math.sqrt(sum / samples);
+}
+
+export class RealtimeMulawSpeechStartDetector {
+  private loudChunks = 0;
+  private quietChunks = DEFAULT_REQUIRED_QUIET_CHUNKS;
+  private speaking = false;
+
+  constructor(
+    private readonly params: {
+      requiredLoudChunks?: number;
+      requiredQuietChunks?: number;
+      rmsThreshold?: number;
+    } = {},
+  ) {}
+
+  accept(muLaw: Buffer): boolean {
+    const rms = calculateMulawRms(muLaw);
+    const threshold = this.params.rmsThreshold ?? DEFAULT_SPEECH_RMS_THRESHOLD;
+    if (rms >= threshold) {
+      this.quietChunks = 0;
+      this.loudChunks += 1;
+      const requiredLoudChunks = this.params.requiredLoudChunks ?? DEFAULT_REQUIRED_LOUD_CHUNKS;
+      if (!this.speaking && this.loudChunks >= requiredLoudChunks) {
+        this.speaking = true;
+        return true;
+      }
+      return false;
+    }
+
+    this.loudChunks = 0;
+    this.quietChunks += 1;
+    const requiredQuietChunks = this.params.requiredQuietChunks ?? DEFAULT_REQUIRED_QUIET_CHUNKS;
+    if (this.quietChunks >= requiredQuietChunks) {
+      this.speaking = false;
+    }
+    return false;
+  }
+}
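
A quick note on the framing constants: at 8 kHz μ-law one byte is one sample, so a 20 ms frame is exactly 8000 × 0.020 × 1 = 160 bytes, which is why the pacer slices on `TELEPHONY_CHUNK_BYTES`. A minimal usage sketch (`MZ-example` is a placeholder stream SID; `sendJson` stands in for the Twilio media-stream WebSocket send):

```ts
import { RealtimeTwilioAudioPacer } from "./realtime-audio-pacer.js";

// Hypothetical wiring: in the real handler, sendJson forwards to the
// Twilio WebSocket and reports whether the socket is still writable.
const pacer = new RealtimeTwilioAudioPacer({
  streamSid: "MZ-example",
  sendJson: (message) => {
    console.log(JSON.stringify(message));
    return true;
  },
});

// One second of μ-law silence (0xff is near-zero amplitude) becomes
// fifty paced 20 ms frames of 160 bytes each.
pacer.sendAudio(Buffer.alloc(8000, 0xff));
pacer.sendMark("utterance-1"); // queued behind the audio, as in the tests above
```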
@@ -16,6 +16,10 @@ import type { CallManager } from "../manager.js";
 import type { VoiceCallProvider } from "../providers/base.js";
 import type { CallRecord, NormalizedEvent } from "../types.js";
 import type { WebhookResponsePayload } from "../webhook.types.js";
+import {
+  RealtimeMulawSpeechStartDetector,
+  RealtimeTwilioAudioPacer,
+} from "./realtime-audio-pacer.js";
 
 export type ToolHandlerContext = {
   partialUserTranscript?: string;
@@ -29,6 +33,7 @@ export type ToolHandlerFn = (
 const STREAM_TOKEN_TTL_MS = 30_000;
 const DEFAULT_HOST = "localhost:8443";
 const MAX_REALTIME_MESSAGE_BYTES = 256 * 1024;
+const MAX_REALTIME_WS_BUFFERED_BYTES = 1024 * 1024;
 
 function normalizePath(pathname: string): string {
   const trimmed = pathname.trim();
@@ -179,7 +184,8 @@ export class RealtimeCallHandler {
           ? (msg.media as Record<string, unknown>)
           : undefined;
       if (msg.event === "media" && typeof mediaData?.payload === "string") {
-        bridge.sendAudio(Buffer.from(mediaData.payload, "base64"));
+        const audio = Buffer.from(mediaData.payload, "base64");
+        bridge.sendAudio(audio);
         if (typeof mediaData.timestamp === "number") {
           bridge.setMediaTimestamp(mediaData.timestamp);
         } else if (typeof mediaData.timestamp === "string") {
@@ -278,7 +284,24 @@ export class RealtimeCallHandler {
       this.endCallInManager(callSid, callId, reason);
     };
 
-    const bridge = createRealtimeVoiceBridgeSession({
+    const sendJson = (message: unknown): boolean => {
+      if (ws.readyState !== WebSocket.OPEN) {
+        return false;
+      }
+      if (ws.bufferedAmount > MAX_REALTIME_WS_BUFFERED_BYTES) {
+        ws.close(1013, "Backpressure: send buffer exceeded");
+        return false;
+      }
+      ws.send(JSON.stringify(message));
+      if (ws.bufferedAmount > MAX_REALTIME_WS_BUFFERED_BYTES) {
+        ws.close(1013, "Backpressure: send buffer exceeded");
+        return false;
+      }
+      return true;
+    };
+    const audioPacer = new RealtimeTwilioAudioPacer({ streamSid, sendJson });
+    const speechDetector = new RealtimeMulawSpeechStartDetector();
+    const session = createRealtimeVoiceBridgeSession({
       provider: this.realtimeProvider,
       providerConfig: this.providerConfig,
       instructions: this.config.instructions,
@@ -288,19 +311,13 @@ export class RealtimeCallHandler {
       audioSink: {
         isOpen: () => ws.readyState === WebSocket.OPEN,
         sendAudio: (muLaw) => {
-          ws.send(
-            JSON.stringify({
-              event: "media",
-              streamSid,
-              media: { payload: muLaw.toString("base64") },
-            }),
-          );
+          audioPacer.sendAudio(muLaw);
         },
         clearAudio: () => {
-          ws.send(JSON.stringify({ event: "clear", streamSid }));
+          audioPacer.clearAudio();
         },
         sendMark: (markName) => {
-          ws.send(JSON.stringify({ event: "mark", streamSid, mark: { name: markName } }));
+          audioPacer.sendMark(markName);
         },
       },
       onTranscript: (role, text, isFinal) => {
@@ -367,24 +384,32 @@ export class RealtimeCallHandler {
         });
       },
     });
-    this.activeBridgesByCallId.set(callId, bridge);
-    this.activeBridgesByCallId.set(callSid, bridge);
-    const closeBridge = bridge.close.bind(bridge);
-    bridge.close = () => {
+    this.activeBridgesByCallId.set(callId, session);
+    this.activeBridgesByCallId.set(callSid, session);
+    const sendAudioToSession = session.sendAudio.bind(session);
+    session.sendAudio = (audio) => {
+      if (speechDetector.accept(audio)) {
+        audioPacer.clearAudio();
+      }
+      sendAudioToSession(audio);
+    };
+    const closeSession = session.close.bind(session);
+    session.close = () => {
       this.activeBridgesByCallId.delete(callId);
       this.activeBridgesByCallId.delete(callSid);
       this.partialUserTranscriptsByCallId.delete(callId);
-      closeBridge();
+      audioPacer.close();
+      closeSession();
     };
 
-    bridge.connect().catch((error: Error) => {
+    session.connect().catch((error: Error) => {
       console.error("[voice-call] Failed to connect realtime bridge:", error);
-      bridge.close();
+      session.close();
       emitCallEnd("error");
       ws.close(1011, "Failed to connect");
     });
 
-    return bridge;
+    return session;
   }
 
   private registerCallInManager(