fix(openai): harden realtime stt

This commit is contained in:
Peter Steinberger
2026-04-23 02:21:42 +01:00
parent 26bf916382
commit 4ff720a837
6 changed files with 208 additions and 5 deletions

View File

@@ -140,6 +140,56 @@ async function createTempAgentDir(): Promise<string> {
return await fs.mkdtemp(path.join(os.tmpdir(), "openai-plugin-live-"));
}
/**
 * Polls `expectation` every 100 ms until it stops throwing or `timeoutMs`
 * elapses. Resolves on the first success; otherwise rejects with the error
 * from a final attempt made at (or past) the deadline.
 *
 * Fixes two issues with the previous implementation:
 * - with `timeoutMs <= 0` the loop body never ran and the function executed
 *   `throw undefined`, producing a useless rejection;
 * - the stored error could be up to one poll interval stale, and a success
 *   landing during the final sleep was still reported as a failure.
 */
async function waitForLiveExpectation(expectation: () => void, timeoutMs = 30_000): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    try {
      expectation();
      return;
    } catch {
      await new Promise((resolve) => setTimeout(resolve, 100));
    }
  }
  // One last check at the deadline: a just-in-time success still passes, and
  // a failure surfaces the *current* error rather than a stale one.
  try {
    expectation();
  } catch (error) {
    throw error instanceof Error
      ? error
      : new Error(`expectation not met within ${timeoutMs}ms: ${String(error)}`);
  }
}
// Collapses a transcript to lowercase ASCII alphanumerics so that matching
// ignores case, whitespace, and punctuation differences.
function normalizeTranscriptForMatch(value: string): string {
  let normalized = "";
  for (const ch of value.toLowerCase()) {
    const isAlnum = (ch >= "a" && ch <= "z") || (ch >= "0" && ch <= "9");
    if (isAlnum) {
      normalized += ch;
    }
  }
  return normalized;
}
// Encodes one 16-bit linear PCM sample as a G.711 mu-law byte.
// Classic companding: clamp, add bias, locate the top set bit to pick the
// segment (exponent), take a 4-bit mantissa, then invert all bits.
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;
  let magnitude = sample;
  if (magnitude > CLIP) {
    magnitude = CLIP;
  } else if (magnitude < -CLIP) {
    magnitude = -CLIP;
  }
  const signBit = magnitude < 0 ? 0x80 : 0x00;
  magnitude = Math.abs(magnitude) + BIAS;
  let segment = 7;
  let probe = 0x4000;
  while (segment > 0 && (magnitude & probe) === 0) {
    probe >>= 1;
    segment -= 1;
  }
  const mantissa = (magnitude >> (segment + 3)) & 0x0f;
  // mu-law transmits the complemented code word.
  return ~(signBit | (segment << 4) | mantissa) & 0xff;
}
// Converts 16-bit little-endian PCM at 24 kHz to 8 kHz mu-law by naive
// decimation: keep every third sample and mu-law-encode it. (No anti-alias
// filtering — adequate for generating test fixtures, not production audio.)
function convertPcm24kToMulaw8k(pcm: Buffer): Buffer {
  const totalSamples = Math.floor(pcm.length / 2);
  const outputCount = Math.floor(totalSamples / 3);
  const encoded = Buffer.alloc(outputCount);
  let readOffset = 0;
  for (let i = 0; i < outputCount; i += 1) {
    encoded[i] = linearToMulaw(pcm.readInt16LE(readOffset));
    readOffset += 6; // skip 3 samples x 2 bytes each
  }
  return encoded;
}
describeLive("openai plugin live", () => {
it("registers an OpenAI provider that can complete a live request", async () => {
const { providers } = await registerOpenAIPlugin();
@@ -247,6 +297,89 @@ describeLive("openai plugin live", () => {
expect(text).toMatch(/\bok\b/);
}, 45_000);
it("opens OpenAI realtime STT before sending audio", async () => {
  const { realtimeTranscriptionProviders } = await registerOpenAIPlugin();
  const provider = requireRegisteredProvider(realtimeTranscriptionProviders, "openai");
  const reportedErrors: Error[] = [];
  const sttSession = provider.createSession({
    providerConfig: {
      apiKey: OPENAI_API_KEY,
      language: "en",
    },
    onError: (error) => {
      reportedErrors.push(error);
    },
  });
  try {
    await sttSession.connect();
    // Give the connection a moment to surface any asynchronous failures
    // before asserting that none occurred.
    await new Promise((resolve) => setTimeout(resolve, 1_000));
    expect(reportedErrors).toEqual([]);
    expect(sttSession.isConnected()).toBe(true);
  } finally {
    sttSession.close();
  }
}, 30_000);
// Live end-to-end flow: synthesize a known phrase via the OpenAI speech
// provider, downsample it to 8 kHz mu-law telephony audio, stream it into a
// realtime STT session in small paced chunks, and verify the phrase comes
// back in the transcript.
it("streams realtime STT through the registered transcription provider", async () => {
  const { realtimeTranscriptionProviders, speechProviders } = await registerOpenAIPlugin();
  const realtimeProvider = requireRegisteredProvider(realtimeTranscriptionProviders, "openai");
  const speechProvider = requireRegisteredProvider(speechProviders, "openai");
  const cfg = createLiveConfig();
  const ttsConfig = createLiveTtsConfig();
  // The phrase carries the distinctive marker "OpenClaw" matched below.
  const phrase = "Testing OpenClaw OpenAI realtime transcription integration test OK.";
  // synthesizeTelephony is optional on the provider, hence ?. plus the
  // explicit undefined check that follows.
  const telephony = await speechProvider.synthesizeTelephony?.({
    text: phrase,
    cfg,
    providerConfig: ttsConfig.providerConfigs.openai ?? {},
    timeoutMs: ttsConfig.timeoutMs,
  });
  if (!telephony) {
    throw new Error("OpenAI telephony synthesis did not return audio");
  }
  // The downsampler below assumes 16-bit PCM at 24 kHz, so pin the format.
  expect(telephony.outputFormat).toBe("pcm");
  expect(telephony.sampleRate).toBe(24_000);
  const transcripts: string[] = [];
  const partials: string[] = [];
  const errors: Error[] = [];
  const session = realtimeProvider.createSession({
    providerConfig: {
      apiKey: OPENAI_API_KEY,
      language: "en",
      // Short silence window so the server finalizes the turn quickly.
      silenceDurationMs: 500,
    },
    onPartial: (partial) => partials.push(partial),
    onTranscript: (transcript) => transcripts.push(transcript),
    onError: (error) => errors.push(error),
  });
  try {
    await session.connect();
    const speech = convertPcm24kToMulaw8k(telephony.audioBuffer);
    // 0xff is the mu-law encoding of zero amplitude (see linearToMulaw), so
    // this buffer is one second of silence; padding the speech with leading
    // and trailing silence gives the server-side VAD clear turn boundaries.
    const silence = Buffer.alloc(8_000, 0xff);
    const audio = Buffer.concat([silence.subarray(0, 4_000), speech, silence]);
    // Stream in 160-byte chunks (20 ms at 8 kHz, one byte per sample) with a
    // small delay to approximate real-time telephony pacing.
    for (let offset = 0; offset < audio.byteLength; offset += 160) {
      session.sendAudio(audio.subarray(offset, offset + 160));
      await new Promise((resolve) => setTimeout(resolve, 5));
    }
    // Poll until a transcript containing the marker arrives; any session
    // error is surfaced immediately instead of waiting out the timeout.
    await waitForLiveExpectation(() => {
      if (errors[0]) {
        throw errors[0];
      }
      expect(normalizeTranscriptForMatch(transcripts.join(" "))).toContain("openclaw");
    }, 60_000);
  } finally {
    session.close();
  }
  // Final assertions run after close so they reflect the complete session.
  const normalized = transcripts.join(" ").toLowerCase();
  const compact = normalizeTranscriptForMatch(normalized);
  expect(compact).toContain("openclaw");
  expect(normalized).toContain("transcription");
  expect(partials.length + transcripts.length).toBeGreaterThan(0);
}, 180_000);
it("generates an image through the registered image provider", async () => {
const { imageProviders } = await registerOpenAIPlugin();
const imageProvider = requireRegisteredProvider(imageProviders, "openai");

View File

@@ -27,7 +27,9 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
rawConfig: {
providers: {
openai: {
language: "en",
model: "gpt-4o-transcribe",
prompt: "expect OpenClaw product names",
silenceDurationMs: 900,
vadThreshold: 0.45,
},
@@ -36,7 +38,9 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => {
});
expect(resolved).toEqual({
language: "en",
model: "gpt-4o-transcribe",
prompt: "expect OpenClaw product names",
silenceDurationMs: 900,
vadThreshold: 0.45,
});

View File

@@ -22,14 +22,18 @@ import {
/**
 * Raw, user-supplied configuration for the OpenAI realtime transcription
 * provider. Every field is optional; session creation applies defaults
 * (model "gpt-4o-transcribe", silenceDurationMs 800, vadThreshold 0.5).
 */
type OpenAIRealtimeTranscriptionProviderConfig = {
  apiKey?: string; // falls back to the streaming openaiApiKey config when absent
  language?: string; // forwarded to input_audio_transcription when set
  model?: string; // transcription model; legacy alias `sttModel` also accepted
  prompt?: string; // forwarded to input_audio_transcription to bias vocabulary
  silenceDurationMs?: number; // presumably wired into server_vad turn detection — confirm
  vadThreshold?: number; // presumably wired into server_vad turn detection — confirm
};
/**
 * Fully-resolved session configuration: the realtime session create request
 * plus provider settings with defaults already applied, so the session class
 * never has to re-derive them.
 */
type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & {
  apiKey: string; // required by the time a session is constructed
  language?: string; // still optional after resolution
  model: string; // defaulted to "gpt-4o-transcribe"
  prompt?: string; // optional transcription prompt
  silenceDurationMs: number; // defaulted to 800
  vadThreshold: number; // defaulted to 0.5
};
@@ -55,7 +59,9 @@ function normalizeProviderConfig(
value: raw?.openaiApiKey,
path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
}),
language: trimToUndefined(raw?.language),
model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel),
prompt: trimToUndefined(raw?.prompt),
silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs),
vadThreshold: asFiniteNumber(raw?.vadThreshold),
};
@@ -141,6 +147,8 @@ class OpenAIRealtimeTranscriptionSession implements RealtimeTranscriptionSession
input_audio_format: "g711_ulaw",
input_audio_transcription: {
model: this.config.model,
...(this.config.language ? { language: this.config.language } : {}),
...(this.config.prompt ? { prompt: this.config.prompt } : {}),
},
turn_detection: {
type: "server_vad",
@@ -301,7 +309,9 @@ export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptio
return new OpenAIRealtimeTranscriptionSession({
...req,
apiKey,
language: config.language,
model: config.model ?? "gpt-4o-transcribe",
prompt: config.prompt,
silenceDurationMs: config.silenceDurationMs ?? 800,
vadThreshold: config.vadThreshold ?? 0.5,
});