From 4ff720a83758906a034bed77f0e5875ee59c18f5 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 23 Apr 2026 02:21:42 +0100 Subject: [PATCH] fix(openai): harden realtime stt --- CHANGELOG.md | 1 + docs/providers/openai.md | 55 +++++++- docs/tools/media-overview.md | 10 +- extensions/openai/openai.live.test.ts | 133 ++++++++++++++++++ .../realtime-transcription-provider.test.ts | 4 + .../openai/realtime-transcription-provider.ts | 10 ++ 6 files changed, 208 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d31c2e0bc73..befe0f09bba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Providers/OpenAI: harden Voice Call realtime transcription against OpenAI Realtime session-update drift, forward language and prompt hints, and add live coverage for realtime STT. - Providers/Moonshot: stop strict-sanitizing Kimi's native tool_call IDs (shaped like `functions.<name>:<index>`) on the OpenAI-compatible transport, so multi-turn agentic flows through Kimi K2.6 no longer break after 2-3 tool-calling rounds when the serving layer fails to match mangled IDs against the original tool definitions. Adds a `sanitizeToolCallIds` opt-out to the shared `openai-compatible` replay family helper and wires Moonshot to it. Fixes #62319. (#70030) Thanks @LeoDu0314. - Dependencies/security: override transitive `uuid` to `14.0.0`, clearing the runtime advisory across dependencies. - Codex harness: ignore dynamic tool descriptions when deciding whether to reuse a native app-server thread while still fingerprinting tool schemas, so channel-specific copy changes no longer reset otherwise compatible Codex conversations. (#69976) Thanks @chen-zhang-cs-code. diff --git a/docs/providers/openai.md b/docs/providers/openai.md index 5b4cf064d78..94cf7eef23c 100644 --- a/docs/providers/openai.md +++ b/docs/providers/openai.md @@ -16,6 +16,21 @@ OpenAI provides developer APIs for GPT models. 
OpenClaw supports two auth routes OpenAI explicitly supports subscription OAuth usage in external tools and workflows like OpenClaw. +## OpenClaw feature coverage + +| OpenAI capability | OpenClaw surface | Status | +| ------------------------- | ----------------------------------------- | ------------------------------------------------------ | +| Chat / Responses | `openai/` model provider | Yes | +| Codex subscription models | `openai-codex/` model provider | Yes | +| Server-side web search | Native OpenAI Responses tool | Yes, when web search is enabled and no provider pinned | +| Images | `image_generate` | Yes | +| Videos | `video_generate` | Yes | +| Text-to-speech | `messages.tts.provider: "openai"` / `tts` | Yes | +| Batch speech-to-text | `tools.media.audio` / media understanding | Yes | +| Streaming speech-to-text | Voice Call `streaming.provider: "openai"` | Yes | +| Realtime voice | Voice Call `realtime.provider: "openai"` | Yes | +| Embeddings | memory embedding provider | Yes | + ## Getting started Choose your preferred auth method and follow the setup steps. @@ -299,18 +314,56 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil + + The bundled `openai` plugin registers batch speech-to-text through + OpenClaw's media-understanding transcription surface. + + - Default model: `gpt-4o-transcribe` + - Endpoint: OpenAI REST `/v1/audio/transcriptions` + - Input path: multipart audio file upload + - Supported by OpenClaw wherever inbound audio transcription uses + `tools.media.audio`, including Discord voice-channel segments and channel + audio attachments + + To force OpenAI for inbound audio transcription: + + ```json5 + { + tools: { + media: { + audio: { + models: [ + { + type: "provider", + provider: "openai", + model: "gpt-4o-transcribe", + }, + ], + }, + }, + }, + } + ``` + + Language and prompt hints are forwarded to OpenAI when supplied by the + shared audio media config or per-call transcription request. 
+ + + The bundled `openai` plugin registers realtime transcription for the Voice Call plugin. | Setting | Config path | Default | |---------|------------|---------| | Model | `plugins.entries.voice-call.config.streaming.providers.openai.model` | `gpt-4o-transcribe` | + | Language | `...openai.language` | (unset) | + | Prompt | `...openai.prompt` | (unset) | | Silence duration | `...openai.silenceDurationMs` | `800` | | VAD threshold | `...openai.vadThreshold` | `0.5` | | API key | `...openai.apiKey` | Falls back to `OPENAI_API_KEY` | - Uses a WebSocket connection to `wss://api.openai.com/v1/realtime` with G.711 u-law audio. + Uses a WebSocket connection to `wss://api.openai.com/v1/realtime` with G.711 u-law (`g711_ulaw` / `audio/pcmu`) audio. This streaming provider is for Voice Call's realtime transcription path; Discord voice currently records short segments and uses the batch `tools.media.audio` transcription path instead. diff --git a/docs/tools/media-overview.md b/docs/tools/media-overview.md index 302af1a5932..8bcb923c90b 100644 --- a/docs/tools/media-overview.md +++ b/docs/tools/media-overview.md @@ -51,10 +51,12 @@ Media understanding uses any vision-capable or audio-capable model registered in Video and music generation run as background tasks because provider processing typically takes 30 seconds to several minutes. When the agent calls `video_generate` or `music_generate`, OpenClaw submits the request to the provider, returns a task ID immediately, and tracks the job in the task ledger. The agent continues responding to other messages while the job runs. When the provider finishes, OpenClaw wakes the agent so it can post the finished media back into the original channel. Image generation and TTS are synchronous and complete inline with the reply. -xAI currently maps to OpenClaw's image, video, search, code-execution, batch -TTS, batch STT, and Voice Call streaming STT surfaces. 
xAI Realtime voice is -an upstream capability, but it is not registered in OpenClaw until the shared -realtime voice contract can represent it. +OpenAI maps to OpenClaw's image, video, batch TTS, batch STT, Voice Call +streaming STT, realtime voice, and memory embedding surfaces. xAI currently +maps to OpenClaw's image, video, search, code-execution, batch TTS, batch STT, +and Voice Call streaming STT surfaces. xAI Realtime voice is an upstream +capability, but it is not registered in OpenClaw until the shared realtime +voice contract can represent it. ## Quick links diff --git a/extensions/openai/openai.live.test.ts b/extensions/openai/openai.live.test.ts index a6f0941bd2b..0f9334d3158 100644 --- a/extensions/openai/openai.live.test.ts +++ b/extensions/openai/openai.live.test.ts @@ -140,6 +140,56 @@ async function createTempAgentDir(): Promise { return await fs.mkdtemp(path.join(os.tmpdir(), "openai-plugin-live-")); } +async function waitForLiveExpectation(expectation: () => void, timeoutMs = 30_000) { + const started = Date.now(); + let lastError: unknown; + while (Date.now() - started < timeoutMs) { + try { + expectation(); + return; + } catch (error) { + lastError = error; + await new Promise((resolve) => setTimeout(resolve, 100)); + } + } + throw lastError; +} + +function normalizeTranscriptForMatch(value: string): string { + return value.toLowerCase().replace(/[^a-z0-9]+/g, ""); +} + +function linearToMulaw(sample: number): number { + const bias = 132; + const clip = 32635; + let next = Math.max(-clip, Math.min(clip, sample)); + const sign = next < 0 ? 
0x80 : 0; + if (next < 0) { + next = -next; + } + + next += bias; + let exponent = 7; + for (let expMask = 0x4000; (next & expMask) === 0 && exponent > 0; exponent -= 1) { + expMask >>= 1; + } + + const mantissa = (next >> (exponent + 3)) & 0x0f; + return ~(sign | (exponent << 4) | mantissa) & 0xff; +} + +function convertPcm24kToMulaw8k(pcm: Buffer): Buffer { + const inputSamples = Math.floor(pcm.length / 2); + const outputSamples = Math.floor(inputSamples / 3); + const mulaw = Buffer.alloc(outputSamples); + + for (let i = 0; i < outputSamples; i += 1) { + mulaw[i] = linearToMulaw(pcm.readInt16LE(i * 3 * 2)); + } + + return mulaw; +} + describeLive("openai plugin live", () => { it("registers an OpenAI provider that can complete a live request", async () => { const { providers } = await registerOpenAIPlugin(); @@ -247,6 +297,89 @@ describeLive("openai plugin live", () => { expect(text).toMatch(/\bok\b/); }, 45_000); + it("opens OpenAI realtime STT before sending audio", async () => { + const { realtimeTranscriptionProviders } = await registerOpenAIPlugin(); + const realtimeProvider = requireRegisteredProvider(realtimeTranscriptionProviders, "openai"); + const errors: Error[] = []; + const session = realtimeProvider.createSession({ + providerConfig: { + apiKey: OPENAI_API_KEY, + language: "en", + }, + onError: (error) => errors.push(error), + }); + + try { + await session.connect(); + await new Promise((resolve) => setTimeout(resolve, 1_000)); + expect(errors).toEqual([]); + expect(session.isConnected()).toBe(true); + } finally { + session.close(); + } + }, 30_000); + + it("streams realtime STT through the registered transcription provider", async () => { + const { realtimeTranscriptionProviders, speechProviders } = await registerOpenAIPlugin(); + const realtimeProvider = requireRegisteredProvider(realtimeTranscriptionProviders, "openai"); + const speechProvider = requireRegisteredProvider(speechProviders, "openai"); + const cfg = createLiveConfig(); + const 
ttsConfig = createLiveTtsConfig(); + const phrase = "Testing OpenClaw OpenAI realtime transcription integration test OK."; + + const telephony = await speechProvider.synthesizeTelephony?.({ + text: phrase, + cfg, + providerConfig: ttsConfig.providerConfigs.openai ?? {}, + timeoutMs: ttsConfig.timeoutMs, + }); + if (!telephony) { + throw new Error("OpenAI telephony synthesis did not return audio"); + } + expect(telephony.outputFormat).toBe("pcm"); + expect(telephony.sampleRate).toBe(24_000); + + const transcripts: string[] = []; + const partials: string[] = []; + const errors: Error[] = []; + const session = realtimeProvider.createSession({ + providerConfig: { + apiKey: OPENAI_API_KEY, + language: "en", + silenceDurationMs: 500, + }, + onPartial: (partial) => partials.push(partial), + onTranscript: (transcript) => transcripts.push(transcript), + onError: (error) => errors.push(error), + }); + + try { + await session.connect(); + const speech = convertPcm24kToMulaw8k(telephony.audioBuffer); + const silence = Buffer.alloc(8_000, 0xff); + const audio = Buffer.concat([silence.subarray(0, 4_000), speech, silence]); + for (let offset = 0; offset < audio.byteLength; offset += 160) { + session.sendAudio(audio.subarray(offset, offset + 160)); + await new Promise((resolve) => setTimeout(resolve, 5)); + } + + await waitForLiveExpectation(() => { + if (errors[0]) { + throw errors[0]; + } + expect(normalizeTranscriptForMatch(transcripts.join(" "))).toContain("openclaw"); + }, 60_000); + } finally { + session.close(); + } + + const normalized = transcripts.join(" ").toLowerCase(); + const compact = normalizeTranscriptForMatch(normalized); + expect(compact).toContain("openclaw"); + expect(normalized).toContain("transcription"); + expect(partials.length + transcripts.length).toBeGreaterThan(0); + }, 180_000); + it("generates an image through the registered image provider", async () => { const { imageProviders } = await registerOpenAIPlugin(); const imageProvider = 
requireRegisteredProvider(imageProviders, "openai"); diff --git a/extensions/openai/realtime-transcription-provider.test.ts b/extensions/openai/realtime-transcription-provider.test.ts index 30b0ed6530e..97e141eeea0 100644 --- a/extensions/openai/realtime-transcription-provider.test.ts +++ b/extensions/openai/realtime-transcription-provider.test.ts @@ -27,7 +27,9 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => { rawConfig: { providers: { openai: { + language: "en", model: "gpt-4o-transcribe", + prompt: "expect OpenClaw product names", silenceDurationMs: 900, vadThreshold: 0.45, }, @@ -36,7 +38,9 @@ describe("buildOpenAIRealtimeTranscriptionProvider", () => { }); expect(resolved).toEqual({ + language: "en", model: "gpt-4o-transcribe", + prompt: "expect OpenClaw product names", silenceDurationMs: 900, vadThreshold: 0.45, }); diff --git a/extensions/openai/realtime-transcription-provider.ts b/extensions/openai/realtime-transcription-provider.ts index 80eaa0ca3e6..7208427d485 100644 --- a/extensions/openai/realtime-transcription-provider.ts +++ b/extensions/openai/realtime-transcription-provider.ts @@ -22,14 +22,18 @@ import { type OpenAIRealtimeTranscriptionProviderConfig = { apiKey?: string; + language?: string; model?: string; + prompt?: string; silenceDurationMs?: number; vadThreshold?: number; }; type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & { apiKey: string; + language?: string; model: string; + prompt?: string; silenceDurationMs: number; vadThreshold: number; }; @@ -55,7 +59,9 @@ function normalizeProviderConfig( value: raw?.openaiApiKey, path: "plugins.entries.voice-call.config.streaming.openaiApiKey", }), + language: trimToUndefined(raw?.language), model: trimToUndefined(raw?.model) ?? 
trimToUndefined(raw?.sttModel), + prompt: trimToUndefined(raw?.prompt), silenceDurationMs: asFiniteNumber(raw?.silenceDurationMs), vadThreshold: asFiniteNumber(raw?.vadThreshold), }; @@ -141,6 +147,8 @@ class OpenAIRealtimeTranscriptionSession implements RealtimeTranscriptionSession input_audio_format: "g711_ulaw", input_audio_transcription: { model: this.config.model, + ...(this.config.language ? { language: this.config.language } : {}), + ...(this.config.prompt ? { prompt: this.config.prompt } : {}), }, turn_detection: { type: "server_vad", @@ -301,7 +309,9 @@ export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptio return new OpenAIRealtimeTranscriptionSession({ ...req, apiKey, + language: config.language, model: config.model ?? "gpt-4o-transcribe", + prompt: config.prompt, silenceDurationMs: config.silenceDurationMs ?? 800, vadThreshold: config.vadThreshold ?? 0.5, });